1 /* 2 * Copyright 2006-2010, Haiku, Inc. All Rights Reserved. 3 * Distributed under the terms of the MIT License. 4 * 5 * Authors: 6 * Andrew Galante, haiku.galante@gmail.com 7 * Axel Dörfler, axeld@pinc-software.de 8 * Hugo Santos, hugosantos@gmail.com 9 */ 10 11 12 #include "TCPEndpoint.h" 13 14 #include <netinet/in.h> 15 #include <netinet/ip.h> 16 #include <netinet/tcp.h> 17 #include <new> 18 #include <signal.h> 19 #include <stdlib.h> 20 #include <string.h> 21 22 #include <KernelExport.h> 23 #include <Select.h> 24 25 #include <net_buffer.h> 26 #include <net_datalink.h> 27 #include <net_stat.h> 28 #include <NetBufferUtilities.h> 29 #include <NetUtilities.h> 30 31 #include <lock.h> 32 #include <tracing.h> 33 #include <util/AutoLock.h> 34 #include <util/list.h> 35 36 #include "EndpointManager.h" 37 38 39 // References: 40 // - RFC 793 - Transmission Control Protocol 41 // - RFC 813 - Window and Acknowledgement Strategy in TCP 42 // - RFC 1337 - TIME_WAIT Assassination Hazards in TCP 43 // 44 // Things this implementation currently doesn't implement: 45 // - TCP Slow Start, Congestion Avoidance, Fast Retransmit, and Fast Recovery, 46 // RFC 2001, RFC 2581, RFC 3042 47 // - NewReno Modification to TCP's Fast Recovery, RFC 2582 48 // - Explicit Congestion Notification (ECN), RFC 3168 49 // - SYN-Cache 50 // - TCP Extensions for High Performance, RFC 1323 51 // - SACK, Selective Acknowledgment - RFC 2018, RFC 2883, RFC 3517 52 // - Forward RTO-Recovery, RFC 4138 53 // - Time-Wait hash instead of keeping sockets alive 54 55 #define PrintAddress(address) \ 56 AddressString(Domain(), address, true).Data() 57 58 //#define TRACE_TCP 59 //#define PROBE_TCP 60 61 #ifdef TRACE_TCP 62 // the space before ', ##args' is important in order for this to work with cpp 2.95 63 # define TRACE(format, args...) dprintf("%ld: TCP [%llu] %p (%12s) " \ 64 format "\n", find_thread(NULL), system_time(), this, \ 65 name_for_state(fState) , ##args) 66 #else 67 # define TRACE(args...) do { } while (0) 68 #endif 69 70 #ifdef PROBE_TCP 71 # define PROBE(buffer, window) \ 72 dprintf("TCP PROBE %llu %s %s %ld snxt %lu suna %lu cw %lu sst %lu win %lu swin %lu smax-suna %lu savail %lu sqused %lu rto %llu\n", \ 73 system_time(), PrintAddress(buffer->source), \ 74 PrintAddress(buffer->destination), buffer->size, fSendNext.Number(), \ 75 fSendUnacknowledged.Number(), fCongestionWindow, fSlowStartThreshold, \ 76 window, fSendWindow, (fSendMax - fSendUnacknowledged).Number(), \ 77 fSendQueue.Available(fSendNext), fSendQueue.Used(), fRetransmitTimeout) 78 #else 79 # define PROBE(buffer, window) do { } while (0) 80 #endif 81 82 #if TCP_TRACING 83 namespace TCPTracing { 84 85 class Receive : public AbstractTraceEntry { 86 public: 87 Receive(TCPEndpoint* endpoint, tcp_segment_header& segment, uint32 window, 88 net_buffer* buffer) 89 : 90 fEndpoint(endpoint), 91 fBuffer(buffer), 92 fBufferSize(buffer->size), 93 fSequence(segment.sequence), 94 fAcknowledge(segment.acknowledge), 95 fWindow(window), 96 fState(endpoint->State()), 97 fFlags(segment.flags) 98 { 99 Initialized(); 100 } 101 102 virtual void AddDump(TraceOutput& out) 103 { 104 out.Print("tcp:%p (%12s) receive buffer %p (%lu bytes), flags %x, " 105 "seq %lu, ack %lu, wnd %lu", fEndpoint, name_for_state(fState), 106 fBuffer, fBufferSize, fFlags, fSequence, fAcknowledge, fWindow); 107 } 108 109 protected: 110 TCPEndpoint* fEndpoint; 111 net_buffer* fBuffer; 112 uint32 fBufferSize; 113 uint32 fSequence; 114 uint32 fAcknowledge; 115 uint32 fWindow; 116 tcp_state fState; 117 uint8 fFlags; 118 }; 119 120 class Send : public AbstractTraceEntry { 121 public: 122 Send(TCPEndpoint* endpoint, tcp_segment_header& segment, net_buffer* buffer, 123 tcp_sequence firstSequence, tcp_sequence lastSequence) 124 : 125 fEndpoint(endpoint), 126 fBuffer(buffer), 127 fBufferSize(buffer->size), 128 fSequence(segment.sequence), 129 fAcknowledge(segment.acknowledge), 130 fFirstSequence(firstSequence.Number()), 131 fLastSequence(lastSequence.Number()), 132 fState(endpoint->State()), 133 fFlags(segment.flags) 134 { 135 Initialized(); 136 } 137 138 virtual void AddDump(TraceOutput& out) 139 { 140 out.Print("tcp:%p (%12s) send buffer %p (%lu bytes), flags %x, " 141 "seq %lu, ack %lu, first %lu, last %lu", 142 fEndpoint, name_for_state(fState), fBuffer, fBufferSize, fFlags, 143 fSequence, fAcknowledge, fFirstSequence, fLastSequence); 144 } 145 146 protected: 147 TCPEndpoint* fEndpoint; 148 net_buffer* fBuffer; 149 uint32 fBufferSize; 150 uint32 fSequence; 151 uint32 fAcknowledge; 152 uint32 fFirstSequence; 153 uint32 fLastSequence; 154 tcp_state fState; 155 uint8 fFlags; 156 }; 157 158 class State : public AbstractTraceEntry { 159 public: 160 State(TCPEndpoint* endpoint) 161 : 162 fEndpoint(endpoint), 163 fState(endpoint->State()) 164 { 165 Initialized(); 166 } 167 168 virtual void AddDump(TraceOutput& out) 169 { 170 out.Print("tcp:%p (%12s) state change", fEndpoint, 171 name_for_state(fState)); 172 } 173 174 protected: 175 TCPEndpoint* fEndpoint; 176 tcp_state fState; 177 }; 178 179 class Spawn : public AbstractTraceEntry { 180 public: 181 Spawn(TCPEndpoint* listeningEndpoint, TCPEndpoint* spawnedEndpoint) 182 : 183 fListeningEndpoint(listeningEndpoint), 184 fSpawnedEndpoint(spawnedEndpoint) 185 { 186 Initialized(); 187 } 188 189 virtual void AddDump(TraceOutput& out) 190 { 191 out.Print("tcp:%p spawns %p", fListeningEndpoint, fSpawnedEndpoint); 192 } 193 194 protected: 195 TCPEndpoint* fListeningEndpoint; 196 TCPEndpoint* fSpawnedEndpoint; 197 }; 198 199 class Error : public AbstractTraceEntry { 200 public: 201 Error(TCPEndpoint* endpoint, const char* error, int32 line) 202 : 203 fEndpoint(endpoint), 204 fLine(line), 205 fError(error), 206 fState(endpoint->State()) 207 { 208 Initialized(); 209 } 210 211 virtual void AddDump(TraceOutput& out) 212 { 213 out.Print("tcp:%p (%12s) error at line %ld: %s", fEndpoint, 214 name_for_state(fState), fLine, fError); 215 } 216 217 protected: 218 TCPEndpoint* fEndpoint; 219 int32 fLine; 220 const char* fError; 221 tcp_state fState; 222 }; 223 224 class Timer : public AbstractTraceEntry { 225 public: 226 Timer(TCPEndpoint* endpoint, const char* which) 227 : 228 fEndpoint(endpoint), 229 fWhich(which), 230 fState(endpoint->State()) 231 { 232 Initialized(); 233 } 234 235 virtual void AddDump(TraceOutput& out) 236 { 237 out.Print("tcp:%p (%12s) %s timer", fEndpoint, 238 name_for_state(fState), fWhich); 239 } 240 241 protected: 242 TCPEndpoint* fEndpoint; 243 const char* fWhich; 244 tcp_state fState; 245 }; 246 247 } // namespace TCPTracing 248 249 # define T(x) new(std::nothrow) TCPTracing::x 250 #else 251 # define T(x) 252 #endif // TCP_TRACING 253 254 // Initial estimate for packet round trip time (RTT) 255 #define TCP_INITIAL_RTT 2000000 256 257 // constants for the fFlags field 258 enum { 259 FLAG_OPTION_WINDOW_SCALE = 0x01, 260 FLAG_OPTION_TIMESTAMP = 0x02, 261 // TODO: Should FLAG_NO_RECEIVE apply as well to received connections? 262 // That is, what is expected from accept() after a shutdown() 263 // is performed on a listen()ing socket. 264 FLAG_NO_RECEIVE = 0x04, 265 FLAG_CLOSED = 0x08, 266 FLAG_DELETE_ON_CLOSE = 0x10, 267 FLAG_LOCAL = 0x20 268 }; 269 270 271 static const int kTimestampFactor = 1024; 272 273 274 static inline bigtime_t 275 absolute_timeout(bigtime_t timeout) 276 { 277 if (timeout == 0 || timeout == B_INFINITE_TIMEOUT) 278 return timeout; 279 280 return timeout + system_time(); 281 } 282 283 284 static inline status_t 285 posix_error(status_t error) 286 { 287 if (error == B_TIMED_OUT) 288 return B_WOULD_BLOCK; 289 290 return error; 291 } 292 293 294 static inline bool 295 in_window(const tcp_sequence& sequence, const tcp_sequence& receiveNext, 296 uint32 receiveWindow) 297 { 298 return sequence >= receiveNext && sequence < (receiveNext + receiveWindow); 299 } 300 301 302 static inline bool 303 segment_in_sequence(const tcp_segment_header& segment, int size, 304 const tcp_sequence& receiveNext, uint32 receiveWindow) 305 { 306 tcp_sequence sequence(segment.sequence); 307 if (size == 0) { 308 if (receiveWindow == 0) 309 return sequence == receiveNext; 310 return in_window(sequence, receiveNext, receiveWindow); 311 } else { 312 if (receiveWindow == 0) 313 return false; 314 return in_window(sequence, receiveNext, receiveWindow) 315 || in_window(sequence + size - 1, receiveNext, receiveWindow); 316 } 317 } 318 319 320 static inline bool 321 is_writable(tcp_state state) 322 { 323 return state == SYNCHRONIZE_SENT || state == SYNCHRONIZE_RECEIVED 324 || state == ESTABLISHED || state == FINISH_RECEIVED; 325 } 326 327 328 static inline uint32 tcp_now() 329 { 330 return system_time() / kTimestampFactor; 331 } 332 333 334 static inline uint32 tcp_diff_timestamp(uint32 base) 335 { 336 uint32 now = tcp_now(); 337 338 if (now > base) 339 return now - base; 340 341 return now + UINT_MAX - base; 342 } 343 344 345 static inline bool 346 state_needs_finish(int32 state) 347 { 348 return state == WAIT_FOR_FINISH_ACKNOWLEDGE 349 || state == FINISH_SENT || state == CLOSING; 350 } 351 352 353 // #pragma mark - 354 355 356 WaitList::WaitList(const char* name) 357 { 358 fCondition = 0; 359 fSem = create_sem(0, name); 360 } 361 362 363 WaitList::~WaitList() 364 { 365 delete_sem(fSem); 366 } 367 368 369 status_t 370 WaitList::InitCheck() const 371 { 372 return fSem; 373 } 374 375 376 status_t 377 WaitList::Wait(MutexLocker& locker, bigtime_t timeout) 378 { 379 locker.Unlock(); 380 381 status_t status = B_OK; 382 383 while (!atomic_test_and_set(&fCondition, 0, 1)) { 384 status = acquire_sem_etc(fSem, 1, B_ABSOLUTE_TIMEOUT | B_CAN_INTERRUPT, 385 timeout); 386 if (status != B_OK) 387 break; 388 } 389 390 locker.Lock(); 391 return status; 392 } 393 394 395 void 396 WaitList::Signal() 397 { 398 atomic_or(&fCondition, 1); 399 #ifdef __HAIKU__ 400 release_sem_etc(fSem, 1, B_DO_NOT_RESCHEDULE | B_RELEASE_ALL); 401 #else 402 int32 count; 403 if (get_sem_count(fSem, &count) == B_OK && count < 0) 404 release_sem_etc(fSem, -count, B_DO_NOT_RESCHEDULE); 405 #endif 406 } 407 408 409 // #pragma mark - 410 411 412 TCPEndpoint::TCPEndpoint(net_socket* socket) 413 : 414 ProtocolSocket(socket), 415 fManager(NULL), 416 fReceiveList("tcp receive"), 417 fSendList("tcp send"), 418 fOptions(0), 419 fSendWindowShift(0), 420 fReceiveWindowShift(0), 421 fSendUnacknowledged(0), 422 fSendNext(0), 423 fSendMax(0), 424 fSendUrgentOffset(0), 425 fSendWindow(0), 426 fSendMaxWindow(0), 427 fSendMaxSegmentSize(TCP_DEFAULT_MAX_SEGMENT_SIZE), 428 fSendQueue(socket->send.buffer_size), 429 fInitialSendSequence(0), 430 fDuplicateAcknowledgeCount(0), 431 fRoute(NULL), 432 fReceiveNext(0), 433 fReceiveMaxAdvertised(0), 434 fReceiveWindow(socket->receive.buffer_size), 435 fReceiveMaxSegmentSize(TCP_DEFAULT_MAX_SEGMENT_SIZE), 436 fReceiveQueue(socket->receive.buffer_size), 437 fRoundTripTime(TCP_INITIAL_RTT / kTimestampFactor), 438 fRoundTripDeviation(TCP_INITIAL_RTT / kTimestampFactor), 439 fRetransmitTimeout(TCP_INITIAL_RTT), 440 fReceivedTimestamp(0), 441 fCongestionWindow(0), 442 fSlowStartThreshold(0), 443 fState(CLOSED), 444 fFlags(FLAG_OPTION_WINDOW_SCALE | FLAG_OPTION_TIMESTAMP) 445 { 446 // TODO: to be replaced with a real read/write locking strategy! 447 mutex_init(&fLock, "tcp lock"); 448 449 gStackModule->init_timer(&fPersistTimer, TCPEndpoint::_PersistTimer, this); 450 gStackModule->init_timer(&fRetransmitTimer, TCPEndpoint::_RetransmitTimer, 451 this); 452 gStackModule->init_timer(&fDelayedAcknowledgeTimer, 453 TCPEndpoint::_DelayedAcknowledgeTimer, this); 454 gStackModule->init_timer(&fTimeWaitTimer, TCPEndpoint::_TimeWaitTimer, 455 this); 456 } 457 458 459 TCPEndpoint::~TCPEndpoint() 460 { 461 mutex_lock(&fLock); 462 463 _CancelConnectionTimers(); 464 gStackModule->cancel_timer(&fTimeWaitTimer); 465 466 if (fManager != NULL) { 467 fManager->Unbind(this); 468 put_endpoint_manager(fManager); 469 } 470 471 mutex_destroy(&fLock); 472 473 // we need to wait for all timers to return 474 gStackModule->wait_for_timer(&fRetransmitTimer); 475 gStackModule->wait_for_timer(&fPersistTimer); 476 gStackModule->wait_for_timer(&fDelayedAcknowledgeTimer); 477 gStackModule->wait_for_timer(&fTimeWaitTimer); 478 479 gDatalinkModule->put_route(Domain(), fRoute); 480 } 481 482 483 status_t 484 TCPEndpoint::InitCheck() const 485 { 486 if (fReceiveList.InitCheck() < B_OK) 487 return fReceiveList.InitCheck(); 488 489 if (fSendList.InitCheck() < B_OK) 490 return fSendList.InitCheck(); 491 492 return B_OK; 493 } 494 495 496 // #pragma mark - protocol API 497 498 499 status_t 500 TCPEndpoint::Open() 501 { 502 TRACE("Open()"); 503 504 status_t status = ProtocolSocket::Open(); 505 if (status < B_OK) 506 return status; 507 508 fManager = get_endpoint_manager(Domain()); 509 if (fManager == NULL) 510 return EAFNOSUPPORT; 511 512 return B_OK; 513 } 514 515 516 status_t 517 TCPEndpoint::Close() 518 { 519 TRACE("Close()"); 520 521 MutexLocker locker(fLock); 522 523 if (fState == LISTEN) 524 delete_sem(fAcceptSemaphore); 525 526 if (fState == SYNCHRONIZE_SENT || fState == LISTEN) { 527 // TODO: what about linger in case of SYNCHRONIZE_SENT? 528 fState = CLOSED; 529 T(State(this)); 530 return B_OK; 531 } 532 533 status_t status = _Disconnect(true); 534 if (status != B_OK) 535 return status; 536 537 if (socket->options & SO_LINGER) { 538 TRACE("Close(): Lingering for %i secs", socket->linger); 539 540 bigtime_t maximum = absolute_timeout(socket->linger * 1000000LL); 541 542 while (fSendQueue.Used() > 0) { 543 status = fSendList.Wait(locker, maximum); 544 if (status == B_TIMED_OUT || status == B_WOULD_BLOCK) 545 break; 546 else if (status < B_OK) 547 return status; 548 } 549 550 TRACE("Close(): after waiting, the SendQ was left with %lu bytes.", 551 fSendQueue.Used()); 552 } 553 return B_OK; 554 } 555 556 557 void 558 TCPEndpoint::Free() 559 { 560 TRACE("Free()"); 561 562 MutexLocker _(fLock); 563 564 if (fState <= SYNCHRONIZE_SENT) 565 return; 566 567 // we are only interested in the timer, not in changing state 568 _EnterTimeWait(); 569 570 fFlags |= FLAG_CLOSED; 571 if ((fFlags & FLAG_DELETE_ON_CLOSE) == 0) { 572 // we'll be freed later when the 2MSL timer expires 573 gSocketModule->acquire_socket(socket); 574 } 575 } 576 577 578 /*! Creates and sends a synchronize packet to /a address, and then waits 579 until the connection has been established or refused. 580 */ 581 status_t 582 TCPEndpoint::Connect(const sockaddr* address) 583 { 584 TRACE("Connect() on address %s", PrintAddress(address)); 585 586 if (!AddressModule()->is_same_family(address)) 587 return EAFNOSUPPORT; 588 589 MutexLocker locker(fLock); 590 591 if (gStackModule->is_restarted_syscall()) { 592 bigtime_t timeout = gStackModule->restore_syscall_restart_timeout(); 593 status_t status = _WaitForEstablished(locker, timeout); 594 TRACE(" Connect(): Connection complete: %s (timeout was %llu)", 595 strerror(status), timeout); 596 return posix_error(status); 597 } 598 599 // Can only call connect() from CLOSED or LISTEN states 600 // otherwise endpoint is considered already connected 601 if (fState == LISTEN) { 602 // this socket is about to connect; remove pending connections in the backlog 603 gSocketModule->set_max_backlog(socket, 0); 604 } else if (fState == ESTABLISHED) { 605 return EISCONN; 606 } else if (fState != CLOSED) 607 return EINPROGRESS; 608 609 // consider destination address INADDR_ANY as INADDR_LOOPBACK 610 sockaddr_storage _address; 611 if (AddressModule()->is_empty_address(address, false)) { 612 AddressModule()->get_loopback_address((sockaddr *)&_address); 613 // for IPv4 and IPv6 the port is at the same offset 614 ((sockaddr_in &)_address).sin_port = ((sockaddr_in *)address)->sin_port; 615 address = (sockaddr *)&_address; 616 } 617 618 status_t status = _PrepareSendPath(address); 619 if (status < B_OK) 620 return status; 621 622 TRACE(" Connect(): starting 3-way handshake..."); 623 624 fState = SYNCHRONIZE_SENT; 625 T(State(this)); 626 627 // send SYN 628 status = _SendQueued(); 629 if (status != B_OK) { 630 _Close(); 631 return status; 632 } 633 634 // If we are running over Loopback, after _SendQueued() returns we 635 // may be in ESTABLISHED already. 636 if (fState == ESTABLISHED) { 637 TRACE(" Connect() completed after _SendQueued()"); 638 return B_OK; 639 } 640 641 // wait until 3-way handshake is complete (if needed) 642 bigtime_t timeout = min_c(socket->send.timeout, TCP_CONNECTION_TIMEOUT); 643 if (timeout == 0) { 644 // we're a non-blocking socket 645 TRACE(" Connect() delayed, return EINPROGRESS"); 646 return EINPROGRESS; 647 } 648 649 bigtime_t absoluteTimeout = absolute_timeout(timeout); 650 gStackModule->store_syscall_restart_timeout(absoluteTimeout); 651 652 status = _WaitForEstablished(locker, absoluteTimeout); 653 TRACE(" Connect(): Connection complete: %s (timeout was %llu)", 654 strerror(status), timeout); 655 return posix_error(status); 656 } 657 658 659 status_t 660 TCPEndpoint::Accept(struct net_socket** _acceptedSocket) 661 { 662 TRACE("Accept()"); 663 664 MutexLocker locker(fLock); 665 666 status_t status; 667 bigtime_t timeout = absolute_timeout(socket->receive.timeout); 668 if (gStackModule->is_restarted_syscall()) 669 timeout = gStackModule->restore_syscall_restart_timeout(); 670 else 671 gStackModule->store_syscall_restart_timeout(timeout); 672 673 do { 674 locker.Unlock(); 675 676 status = acquire_sem_etc(fAcceptSemaphore, 1, B_ABSOLUTE_TIMEOUT 677 | B_CAN_INTERRUPT, timeout); 678 if (status != B_OK) { 679 if (status == B_TIMED_OUT && socket->receive.timeout == 0) 680 return B_WOULD_BLOCK; 681 682 return status; 683 } 684 685 locker.Lock(); 686 status = gSocketModule->dequeue_connected(socket, _acceptedSocket); 687 #ifdef TRACE_TCP 688 if (status == B_OK) 689 TRACE(" Accept() returning %p", (*_acceptedSocket)->first_protocol); 690 #endif 691 } while (status != B_OK); 692 693 return status; 694 } 695 696 697 status_t 698 TCPEndpoint::Bind(const sockaddr *address) 699 { 700 if (address == NULL) 701 return B_BAD_VALUE; 702 703 MutexLocker lock(fLock); 704 705 TRACE("Bind() on address %s", PrintAddress(address)); 706 707 if (fState != CLOSED) 708 return EISCONN; 709 710 return fManager->Bind(this, address); 711 } 712 713 714 status_t 715 TCPEndpoint::Unbind(struct sockaddr *address) 716 { 717 TRACE("Unbind()"); 718 719 MutexLocker _(fLock); 720 return fManager->Unbind(this); 721 } 722 723 724 status_t 725 TCPEndpoint::Listen(int count) 726 { 727 TRACE("Listen()"); 728 729 MutexLocker _(fLock); 730 731 if (fState != CLOSED && fState != LISTEN) 732 return B_BAD_VALUE; 733 734 if (fState == CLOSED) { 735 fAcceptSemaphore = create_sem(0, "tcp accept"); 736 if (fAcceptSemaphore < B_OK) 737 return ENOBUFS; 738 739 status_t status = fManager->SetPassive(this); 740 if (status != B_OK) { 741 delete_sem(fAcceptSemaphore); 742 fAcceptSemaphore = -1; 743 return status; 744 } 745 } 746 747 gSocketModule->set_max_backlog(socket, count); 748 749 fState = LISTEN; 750 T(State(this)); 751 return B_OK; 752 } 753 754 755 status_t 756 TCPEndpoint::Shutdown(int direction) 757 { 758 TRACE("Shutdown(%i)", direction); 759 760 MutexLocker lock(fLock); 761 762 if (direction == SHUT_RD || direction == SHUT_RDWR) 763 fFlags |= FLAG_NO_RECEIVE; 764 765 if (direction == SHUT_WR || direction == SHUT_RDWR) { 766 // TODO: That's not correct. After read/write shutting down the socket 767 // one should still be able to read previously arrived data. 768 _Disconnect(false); 769 } 770 771 return B_OK; 772 } 773 774 775 /*! Puts data contained in \a buffer into send buffer */ 776 status_t 777 TCPEndpoint::SendData(net_buffer *buffer) 778 { 779 MutexLocker lock(fLock); 780 781 TRACE("SendData(buffer %p, size %lu, flags %lx) [total %lu bytes, has %lu]", 782 buffer, buffer->size, buffer->flags, fSendQueue.Size(), 783 fSendQueue.Free()); 784 785 if (fState == CLOSED) 786 return ENOTCONN; 787 if (fState == LISTEN) 788 return EDESTADDRREQ; 789 if (!is_writable(fState)) { 790 // we only send signals when called from userland 791 if (gStackModule->is_syscall()) 792 send_signal(find_thread(NULL), SIGPIPE); 793 return EPIPE; 794 } 795 796 uint32 flags = buffer->flags; 797 size_t left = buffer->size; 798 799 bigtime_t timeout = absolute_timeout(socket->send.timeout); 800 if (gStackModule->is_restarted_syscall()) 801 timeout = gStackModule->restore_syscall_restart_timeout(); 802 else 803 gStackModule->store_syscall_restart_timeout(timeout); 804 805 while (left > 0) { 806 while (fSendQueue.Free() < socket->send.low_water_mark) { 807 // wait until enough space is available 808 status_t status = fSendList.Wait(lock, timeout); 809 if (status < B_OK) { 810 TRACE(" SendData() returning %s (%d)", 811 strerror(posix_error(status)), (int)posix_error(status)); 812 return posix_error(status); 813 } 814 815 if (!is_writable(fState)) { 816 // we only send signals when called from userland 817 if (gStackModule->is_syscall()) 818 send_signal(find_thread(NULL), SIGPIPE); 819 return EPIPE; 820 } 821 } 822 823 size_t size = fSendQueue.Free(); 824 if (size < left) { 825 // we need to split the original buffer 826 net_buffer* clone = gBufferModule->clone(buffer, false); 827 // TODO: add offset/size parameter to net_buffer::clone() or 828 // even a move_data() function, as this is a bit inefficient 829 if (clone == NULL) 830 return ENOBUFS; 831 832 status_t status = gBufferModule->trim(clone, size); 833 if (status != B_OK) { 834 gBufferModule->free(clone); 835 return status; 836 } 837 838 gBufferModule->remove_header(buffer, size); 839 left -= size; 840 fSendQueue.Add(clone); 841 } else { 842 left -= buffer->size; 843 fSendQueue.Add(buffer); 844 } 845 } 846 847 TRACE(" SendData(): %lu bytes used.", fSendQueue.Used()); 848 849 bool force = false; 850 if ((flags & MSG_OOB) != 0) { 851 fSendUrgentOffset = fSendQueue.LastSequence(); 852 // RFC 961 specifies that the urgent offset points to the last 853 // byte of urgent data. However, this is commonly implemented as 854 // here, ie. it points to the first byte after the urgent data. 855 force = true; 856 } 857 if ((flags & MSG_EOF) != 0) 858 _Disconnect(false); 859 860 if (fState == ESTABLISHED || fState == FINISH_RECEIVED) 861 _SendQueued(force); 862 863 return B_OK; 864 } 865 866 867 ssize_t 868 TCPEndpoint::SendAvailable() 869 { 870 MutexLocker locker(fLock); 871 872 ssize_t available; 873 874 if (is_writable(fState)) 875 available = fSendQueue.Free(); 876 else 877 available = EPIPE; 878 879 TRACE("SendAvailable(): %li", available); 880 return available; 881 } 882 883 884 status_t 885 TCPEndpoint::FillStat(net_stat *stat) 886 { 887 MutexLocker _(fLock); 888 889 strlcpy(stat->state, name_for_state(fState), sizeof(stat->state)); 890 stat->receive_queue_size = fReceiveQueue.Available(); 891 stat->send_queue_size = fSendQueue.Used(); 892 893 return B_OK; 894 } 895 896 897 status_t 898 TCPEndpoint::ReadData(size_t numBytes, uint32 flags, net_buffer** _buffer) 899 { 900 TRACE("ReadData(%lu bytes, flags 0x%x)", numBytes, (unsigned int)flags); 901 902 MutexLocker locker(fLock); 903 904 *_buffer = NULL; 905 906 if (fState == CLOSED) 907 return ENOTCONN; 908 909 bigtime_t timeout = absolute_timeout(socket->receive.timeout); 910 if (gStackModule->is_restarted_syscall()) 911 timeout = gStackModule->restore_syscall_restart_timeout(); 912 else 913 gStackModule->store_syscall_restart_timeout(timeout); 914 915 if (fState == SYNCHRONIZE_SENT || fState == SYNCHRONIZE_RECEIVED) { 916 if (flags & MSG_DONTWAIT) 917 return B_WOULD_BLOCK; 918 919 status_t status = _WaitForEstablished(locker, timeout); 920 if (status < B_OK) 921 return posix_error(status); 922 } 923 924 size_t dataNeeded = socket->receive.low_water_mark; 925 926 // When MSG_WAITALL is set then the function should block 927 // until the full amount of data can be returned. 928 if (flags & MSG_WAITALL) 929 dataNeeded = numBytes; 930 931 // TODO: add support for urgent data (MSG_OOB) 932 933 while (true) { 934 if (fState == CLOSING || fState == WAIT_FOR_FINISH_ACKNOWLEDGE 935 || fState == TIME_WAIT) { 936 // ``Connection closing''. 937 return B_OK; 938 } 939 940 if (fReceiveQueue.Available() > 0) { 941 if (fReceiveQueue.Available() >= dataNeeded 942 || (fReceiveQueue.PushedData() > 0 943 && fReceiveQueue.PushedData() >= fReceiveQueue.Available())) 944 break; 945 } else if (fState == FINISH_RECEIVED) { 946 // ``If no text is awaiting delivery, the RECEIVE will 947 // get a Connection closing''. 948 return B_OK; 949 } 950 951 if ((flags & MSG_DONTWAIT) != 0 || socket->receive.timeout == 0) 952 return B_WOULD_BLOCK; 953 954 if ((fFlags & FLAG_NO_RECEIVE) != 0) 955 return B_OK; 956 957 status_t status = fReceiveList.Wait(locker, timeout); 958 if (status < B_OK) { 959 // The Open Group base specification mentions that EINTR should be 960 // returned if the recv() is interrupted before _any data_ is 961 // available. So we actually check if there is data, and if so, 962 // push it to the user. 963 if ((status == B_TIMED_OUT || status == B_INTERRUPTED) 964 && fReceiveQueue.Available() > 0) 965 break; 966 967 return posix_error(status); 968 } 969 } 970 971 TRACE(" ReadData(): %lu are available.", fReceiveQueue.Available()); 972 973 if (numBytes < fReceiveQueue.Available()) 974 fReceiveList.Signal(); 975 976 bool clone = (flags & MSG_PEEK) != 0; 977 978 ssize_t receivedBytes = fReceiveQueue.Get(numBytes, !clone, _buffer); 979 980 TRACE(" ReadData(): %lu bytes kept.", fReceiveQueue.Available()); 981 982 // if we are opening the window, check if we should send an ACK 983 if (!clone) 984 SendAcknowledge(false); 985 986 return receivedBytes; 987 } 988 989 990 ssize_t 991 TCPEndpoint::ReadAvailable() 992 { 993 MutexLocker locker(fLock); 994 995 TRACE("ReadAvailable(): %li", _AvailableData()); 996 997 return _AvailableData(); 998 } 999 1000 1001 status_t 1002 TCPEndpoint::SetSendBufferSize(size_t length) 1003 { 1004 MutexLocker _(fLock); 1005 fSendQueue.SetMaxBytes(length); 1006 return B_OK; 1007 } 1008 1009 1010 status_t 1011 TCPEndpoint::SetReceiveBufferSize(size_t length) 1012 { 1013 MutexLocker _(fLock); 1014 fReceiveQueue.SetMaxBytes(length); 1015 return B_OK; 1016 } 1017 1018 1019 status_t 1020 TCPEndpoint::GetOption(int option, void* _value, int* _length) 1021 { 1022 if (*_length != sizeof(int)) 1023 return B_BAD_VALUE; 1024 1025 int* value = (int*)_value; 1026 1027 switch (option) { 1028 case TCP_NODELAY: 1029 if ((fOptions & TCP_NODELAY) != 0) 1030 *value = 1; 1031 else 1032 *value = 0; 1033 return B_OK; 1034 1035 case TCP_MAXSEG: 1036 *value = fReceiveMaxSegmentSize; 1037 return B_OK; 1038 1039 default: 1040 return B_BAD_VALUE; 1041 } 1042 } 1043 1044 1045 status_t 1046 TCPEndpoint::SetOption(int option, const void* _value, int length) 1047 { 1048 if (option != TCP_NODELAY) 1049 return B_BAD_VALUE; 1050 1051 if (length != sizeof(int)) 1052 return B_BAD_VALUE; 1053 1054 const int* value = (const int*)_value; 1055 1056 MutexLocker _(fLock); 1057 if (*value) 1058 fOptions |= TCP_NODELAY; 1059 else 1060 fOptions &= ~TCP_NODELAY; 1061 1062 return B_OK; 1063 } 1064 1065 1066 // #pragma mark - misc 1067 1068 1069 bool 1070 TCPEndpoint::IsBound() const 1071 { 1072 return !LocalAddress().IsEmpty(true); 1073 } 1074 1075 1076 bool 1077 TCPEndpoint::IsLocal() const 1078 { 1079 return (fFlags & FLAG_LOCAL) != 0; 1080 } 1081 1082 1083 status_t 1084 TCPEndpoint::DelayedAcknowledge() 1085 { 1086 if (gStackModule->cancel_timer(&fDelayedAcknowledgeTimer)) { 1087 // timer was active, send an ACK now (with the exception above, 1088 // we send every other ACK) 1089 return SendAcknowledge(true); 1090 } 1091 1092 gStackModule->set_timer(&fDelayedAcknowledgeTimer, 1093 TCP_DELAYED_ACKNOWLEDGE_TIMEOUT); 1094 return B_OK; 1095 } 1096 1097 1098 status_t 1099 TCPEndpoint::SendAcknowledge(bool force) 1100 { 1101 return _SendQueued(force, 0); 1102 } 1103 1104 1105 void 1106 TCPEndpoint::_StartPersistTimer() 1107 { 1108 gStackModule->set_timer(&fPersistTimer, 1000000LL); 1109 } 1110 1111 1112 void 1113 TCPEndpoint::_EnterTimeWait() 1114 { 1115 TRACE("_EnterTimeWait()\n"); 1116 1117 _CancelConnectionTimers(); 1118 1119 if (fState == TIME_WAIT && IsLocal()) { 1120 // we do not use TIME_WAIT state for local connections 1121 fFlags |= FLAG_DELETE_ON_CLOSE; 1122 return; 1123 } 1124 1125 _UpdateTimeWait(); 1126 } 1127 1128 1129 void 1130 TCPEndpoint::_UpdateTimeWait() 1131 { 1132 gStackModule->set_timer(&fTimeWaitTimer, TCP_MAX_SEGMENT_LIFETIME << 1); 1133 } 1134 1135 1136 void 1137 TCPEndpoint::_CancelConnectionTimers() 1138 { 1139 gStackModule->cancel_timer(&fRetransmitTimer); 1140 gStackModule->cancel_timer(&fPersistTimer); 1141 gStackModule->cancel_timer(&fDelayedAcknowledgeTimer); 1142 } 1143 1144 1145 /*! Sends the FIN flag to the peer when the connection is still open. 1146 Moves the endpoint to the next state depending on where it was. 1147 */ 1148 status_t 1149 TCPEndpoint::_Disconnect(bool closing) 1150 { 1151 tcp_state previousState = fState; 1152 1153 if (fState == SYNCHRONIZE_RECEIVED || fState == ESTABLISHED) 1154 fState = FINISH_SENT; 1155 else if (fState == FINISH_RECEIVED) 1156 fState = WAIT_FOR_FINISH_ACKNOWLEDGE; 1157 else 1158 return B_OK; 1159 1160 T(State(this)); 1161 1162 status_t status = _SendQueued(); 1163 if (status != B_OK) { 1164 fState = previousState; 1165 T(State(this)); 1166 return status; 1167 } 1168 1169 return B_OK; 1170 } 1171 1172 1173 void 1174 TCPEndpoint::_MarkEstablished() 1175 { 1176 fState = ESTABLISHED; 1177 T(State(this)); 1178 1179 if (gSocketModule->has_parent(socket)) { 1180 gSocketModule->set_connected(socket); 1181 release_sem_etc(fAcceptSemaphore, 1, B_DO_NOT_RESCHEDULE); 1182 } 1183 1184 fSendList.Signal(); 1185 } 1186 1187 1188 status_t 1189 TCPEndpoint::_WaitForEstablished(MutexLocker &locker, bigtime_t timeout) 1190 { 1191 // TODO: Checking for CLOSED seems correct, but breaks several neon tests. 1192 // When investigating this, also have a look at _Close() and _HandleReset(). 1193 while (fState < ESTABLISHED/* && fState != CLOSED*/) { 1194 if (socket->error != B_OK) 1195 return socket->error; 1196 1197 status_t status = fSendList.Wait(locker, timeout); 1198 if (status < B_OK) 1199 return status; 1200 } 1201 1202 return B_OK; 1203 } 1204 1205 1206 // #pragma mark - receive 1207 1208 1209 void 1210 TCPEndpoint::_Close() 1211 { 1212 _CancelConnectionTimers(); 1213 fState = CLOSED; 1214 T(State(this)); 1215 1216 fFlags |= FLAG_DELETE_ON_CLOSE; 1217 1218 fSendList.Signal(); 1219 _NotifyReader(); 1220 1221 if (gSocketModule->has_parent(socket)) { 1222 // We still have a parent - obviously, we haven't been accepted yet, 1223 // so no one could ever close us. 1224 _CancelConnectionTimers(); 1225 gSocketModule->set_aborted(socket); 1226 } 1227 } 1228 1229 1230 void 1231 TCPEndpoint::_HandleReset(status_t error) 1232 { 1233 socket->error = error; 1234 _Close(); 1235 1236 gSocketModule->notify(socket, B_SELECT_WRITE, error); 1237 gSocketModule->notify(socket, B_SELECT_ERROR, error); 1238 } 1239 1240 1241 void 1242 TCPEndpoint::_DuplicateAcknowledge(tcp_segment_header &segment) 1243 { 1244 if (++fDuplicateAcknowledgeCount < 3) 1245 return; 1246 1247 if (fDuplicateAcknowledgeCount == 3) { 1248 _ResetSlowStart(); 1249 fCongestionWindow = fSlowStartThreshold + 3 1250 * fSendMaxSegmentSize; 1251 fSendNext = segment.acknowledge; 1252 } else if (fDuplicateAcknowledgeCount > 3) 1253 fCongestionWindow += fSendMaxSegmentSize; 1254 1255 _SendQueued(); 1256 } 1257 1258 1259 void 1260 TCPEndpoint::_UpdateTimestamps(tcp_segment_header& segment, 1261 size_t segmentLength) 1262 { 1263 if (fFlags & FLAG_OPTION_TIMESTAMP) { 1264 tcp_sequence sequence(segment.sequence); 1265 1266 if (fLastAcknowledgeSent >= sequence 1267 && fLastAcknowledgeSent < (sequence + segmentLength)) 1268 fReceivedTimestamp = segment.timestamp_value; 1269 } 1270 } 1271 1272 1273 ssize_t 1274 TCPEndpoint::_AvailableData() const 1275 { 1276 // TODO: Refer to the FLAG_NO_RECEIVE comment above regarding 1277 // the application of FLAG_NO_RECEIVE in listen()ing 1278 // sockets. 1279 if (fState == LISTEN) 1280 return gSocketModule->count_connected(socket); 1281 if (fState == SYNCHRONIZE_SENT) 1282 return 0; 1283 1284 ssize_t availableData = fReceiveQueue.Available(); 1285 1286 if (availableData == 0 && !_ShouldReceive()) 1287 return ENOTCONN; 1288 1289 return availableData; 1290 } 1291 1292 1293 void 1294 TCPEndpoint::_NotifyReader() 1295 { 1296 fReceiveList.Signal(); 1297 gSocketModule->notify(socket, B_SELECT_READ, _AvailableData()); 1298 } 1299 1300 1301 bool 1302 TCPEndpoint::_AddData(tcp_segment_header& segment, net_buffer* buffer) 1303 { 1304 if ((segment.flags & TCP_FLAG_FINISH) != 0) { 1305 // Remember the position of the finish received flag 1306 fFinishReceived = true; 1307 fFinishReceivedAt = segment.sequence + buffer->size; 1308 } 1309 1310 fReceiveQueue.Add(buffer, segment.sequence); 1311 fReceiveNext = fReceiveQueue.NextSequence(); 1312 1313 if (fFinishReceived) { 1314 // Set or reset the finish flag on the current segment 1315 if (fReceiveNext < fFinishReceivedAt) 1316 segment.flags &= ~TCP_FLAG_FINISH; 1317 else 1318 segment.flags |= TCP_FLAG_FINISH; 1319 } 1320 1321 TRACE(" _AddData(): adding data, receive next = %lu. Now have %lu bytes.", 1322 fReceiveNext.Number(), fReceiveQueue.Available()); 1323 1324 if ((segment.flags & TCP_FLAG_PUSH) != 0) 1325 fReceiveQueue.SetPushPointer(); 1326 1327 return fReceiveQueue.Available() > 0; 1328 } 1329 1330 1331 void 1332 TCPEndpoint::_PrepareReceivePath(tcp_segment_header& segment) 1333 { 1334 fInitialReceiveSequence = segment.sequence; 1335 fFinishReceived = false; 1336 1337 // count the received SYN 1338 segment.sequence++; 1339 1340 fReceiveNext = segment.sequence; 1341 fReceiveQueue.SetInitialSequence(segment.sequence); 1342 1343 if ((fOptions & TCP_NOOPT) == 0) { 1344 if (segment.max_segment_size > 0) 1345 fSendMaxSegmentSize = segment.max_segment_size; 1346 1347 if (segment.options & TCP_HAS_WINDOW_SCALE) { 1348 fFlags |= FLAG_OPTION_WINDOW_SCALE; 1349 fSendWindowShift = segment.window_shift; 1350 } else { 1351 fFlags &= ~FLAG_OPTION_WINDOW_SCALE; 1352 fReceiveWindowShift = 0; 1353 } 1354 1355 if (segment.options & TCP_HAS_TIMESTAMPS) { 1356 fFlags |= FLAG_OPTION_TIMESTAMP; 1357 fReceivedTimestamp = segment.timestamp_value; 1358 } else 1359 fFlags &= ~FLAG_OPTION_TIMESTAMP; 1360 } 1361 1362 fCongestionWindow = 2 * fSendMaxSegmentSize; 1363 fSlowStartThreshold = (uint32)segment.advertised_window << fSendWindowShift; 1364 } 1365 1366 1367 bool 1368 TCPEndpoint::_ShouldReceive() const 1369 { 1370 if ((fFlags & FLAG_NO_RECEIVE) != 0) 1371 return false; 1372 1373 return fState == ESTABLISHED || fState == FINISH_SENT 1374 || fState == FINISH_ACKNOWLEDGED; 1375 } 1376 1377 1378 int32 1379 TCPEndpoint::_Spawn(TCPEndpoint* parent, tcp_segment_header& segment, 1380 net_buffer* buffer) 1381 { 1382 MutexLocker _(fLock); 1383 1384 // TODO error checking 1385 ProtocolSocket::Open(); 1386 1387 fState = SYNCHRONIZE_RECEIVED; 1388 T(Spawn(parent, this)); 1389 1390 fManager = parent->fManager; 1391 1392 LocalAddress().SetTo(buffer->destination); 1393 PeerAddress().SetTo(buffer->source); 1394 1395 TRACE("Spawn()"); 1396 1397 // TODO: proper error handling! 1398 if (fManager->BindChild(this) != B_OK) { 1399 T(Error(this, "binding failed", __LINE__)); 1400 return DROP; 1401 } 1402 if (_PrepareSendPath(*PeerAddress()) != B_OK) { 1403 T(Error(this, "prepare send faild", __LINE__)); 1404 return DROP; 1405 } 1406 1407 fOptions = parent->fOptions; 1408 fAcceptSemaphore = parent->fAcceptSemaphore; 1409 1410 _PrepareReceivePath(segment); 1411 1412 // send SYN+ACK 1413 if (_SendQueued() != B_OK) { 1414 T(Error(this, "sending failed", __LINE__)); 1415 return DROP; 1416 } 1417 1418 segment.flags &= ~TCP_FLAG_SYNCHRONIZE; 1419 // we handled this flag now, it must not be set for further processing 1420 1421 return _Receive(segment, buffer); 1422 } 1423 1424 1425 int32 1426 TCPEndpoint::_ListenReceive(tcp_segment_header& segment, net_buffer* buffer) 1427 { 1428 TRACE("ListenReceive()"); 1429 1430 // Essentially, we accept only TCP_FLAG_SYNCHRONIZE in this state, 1431 // but the error behaviour differs 1432 if (segment.flags & TCP_FLAG_RESET) 1433 return DROP; 1434 if (segment.flags & TCP_FLAG_ACKNOWLEDGE) 1435 return DROP | RESET; 1436 if ((segment.flags & TCP_FLAG_SYNCHRONIZE) == 0) 1437 return DROP; 1438 1439 // TODO: drop broadcast/multicast 1440 1441 // spawn new endpoint for accept() 1442 net_socket* newSocket; 1443 if (gSocketModule->spawn_pending_socket(socket, &newSocket) < B_OK) { 1444 T(Error(this, "spawning failed", __LINE__)); 1445 return DROP; 1446 } 1447 1448 return ((TCPEndpoint *)newSocket->first_protocol)->_Spawn(this, 1449 segment, buffer); 1450 } 1451 1452 1453 int32 1454 TCPEndpoint::_SynchronizeSentReceive(tcp_segment_header &segment, 1455 net_buffer *buffer) 1456 { 1457 TRACE("_SynchronizeSentReceive()"); 1458 1459 if ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0 1460 && (fInitialSendSequence >= segment.acknowledge 1461 || fSendMax < segment.acknowledge)) 1462 return DROP | RESET; 1463 1464 if (segment.flags & TCP_FLAG_RESET) { 1465 _HandleReset(ECONNREFUSED); 1466 return DROP; 1467 } 1468 1469 if ((segment.flags & TCP_FLAG_SYNCHRONIZE) == 0) 1470 return DROP; 1471 1472 fSendUnacknowledged = segment.acknowledge; 1473 _PrepareReceivePath(segment); 1474 1475 if (segment.flags & TCP_FLAG_ACKNOWLEDGE) { 1476 _MarkEstablished(); 1477 } else { 1478 // simultaneous open 1479 fState = SYNCHRONIZE_RECEIVED; 1480 T(State(this)); 1481 } 1482 1483 segment.flags &= ~TCP_FLAG_SYNCHRONIZE; 1484 // we handled this flag now, it must not be set for further processing 1485 1486 return _Receive(segment, buffer) | IMMEDIATE_ACKNOWLEDGE; 1487 } 1488 1489 1490 int32 1491 TCPEndpoint::_Receive(tcp_segment_header& segment, net_buffer* buffer) 1492 { 1493 uint32 advertisedWindow = (uint32)segment.advertised_window 1494 << fSendWindowShift; 1495 size_t segmentLength = buffer->size; 1496 1497 // First, handle the most common case for uni-directional data transfer 1498 // (known as header prediction - the segment must not change the window, 1499 // and must be the expected sequence, and contain no control flags) 1500 1501 if (fState == ESTABLISHED 1502 && segment.AcknowledgeOnly() 1503 && fReceiveNext == segment.sequence 1504 && advertisedWindow > 0 && advertisedWindow == fSendWindow 1505 && fSendNext == fSendMax) { 1506 _UpdateTimestamps(segment, segmentLength); 1507 1508 if (segmentLength == 0) { 1509 // this is a pure acknowledge segment - we're on the sending end 1510 if (fSendUnacknowledged < segment.acknowledge 1511 && fSendMax >= segment.acknowledge) { 1512 _Acknowledged(segment); 1513 return DROP; 1514 } 1515 } else if (segment.acknowledge == fSendUnacknowledged 1516 && fReceiveQueue.IsContiguous() 1517 && fReceiveQueue.Free() >= segmentLength 1518 && (fFlags & FLAG_NO_RECEIVE) == 0) { 1519 if (_AddData(segment, buffer)) 1520 _NotifyReader(); 1521 1522 return KEEP | ((segment.flags & TCP_FLAG_PUSH) != 0 1523 ? IMMEDIATE_ACKNOWLEDGE : ACKNOWLEDGE); 1524 } 1525 } 1526 1527 // The fast path was not applicable, so we continue with the standard 1528 // processing of the incoming segment 1529 1530 ASSERT(fState != SYNCHRONIZE_SENT && fState != LISTEN); 1531 1532 if (fState != CLOSED && fState != TIME_WAIT) { 1533 // Check sequence number 1534 if (!segment_in_sequence(segment, segmentLength, fReceiveNext, 1535 fReceiveWindow)) { 1536 TRACE(" Receive(): segment out of window, next: %lu wnd: %lu", 1537 fReceiveNext.Number(), fReceiveWindow); 1538 if ((segment.flags & TCP_FLAG_RESET) != 0) { 1539 // TODO: this doesn't look right - review! 1540 return DROP; 1541 } 1542 return DROP | IMMEDIATE_ACKNOWLEDGE; 1543 } 1544 } 1545 1546 if ((segment.flags & TCP_FLAG_RESET) != 0) { 1547 // Is this a valid reset? 1548 // We generally ignore resets in time wait state (see RFC 1337) 1549 if (fLastAcknowledgeSent <= segment.sequence 1550 && tcp_sequence(segment.sequence) < (fLastAcknowledgeSent 1551 + fReceiveWindow) 1552 && fState != TIME_WAIT) { 1553 status_t error; 1554 if (fState == SYNCHRONIZE_RECEIVED) 1555 error = ECONNREFUSED; 1556 else if (fState == CLOSING || fState == WAIT_FOR_FINISH_ACKNOWLEDGE) 1557 error = ENOTCONN; 1558 else 1559 error = ECONNRESET; 1560 1561 _HandleReset(error); 1562 } 1563 1564 return DROP; 1565 } 1566 1567 if ((segment.flags & TCP_FLAG_SYNCHRONIZE) != 0 1568 || (fState == SYNCHRONIZE_RECEIVED 1569 && (fInitialReceiveSequence > segment.sequence 1570 || ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0 1571 && (fSendUnacknowledged > segment.acknowledge 1572 || fSendMax < segment.acknowledge))))) { 1573 // reset the connection - either the initial SYN was faulty, or we 1574 // received a SYN within the data stream 1575 return DROP | RESET; 1576 } 1577 1578 // TODO: Check this! Why do we advertize a window outside of what we should 1579 // buffer? 1580 fReceiveWindow = max_c(fReceiveQueue.Free(), fReceiveWindow); 1581 // the window must not shrink 1582 1583 // trim buffer to be within the receive window 1584 int32 drop = (int32)(fReceiveNext - segment.sequence).Number(); 1585 if (drop > 0) { 1586 if ((uint32)drop > buffer->size 1587 || ((uint32)drop == buffer->size 1588 && (segment.flags & TCP_FLAG_FINISH) == 0)) { 1589 // don't accidently remove a FIN we shouldn't remove 1590 segment.flags &= ~TCP_FLAG_FINISH; 1591 drop = buffer->size; 1592 } 1593 1594 // remove duplicate data at the start 1595 TRACE("* remove %ld bytes from the start", drop); 1596 gBufferModule->remove_header(buffer, drop); 1597 segment.sequence += drop; 1598 } 1599 1600 int32 action = KEEP; 1601 1602 drop = (int32)(segment.sequence + buffer->size 1603 - (fReceiveNext + fReceiveWindow)).Number(); 1604 if (drop > 0) { 1605 // remove data exceeding our window 1606 if ((uint32)drop >= buffer->size) { 1607 // if we can accept data, or the segment is not what we'd expect, 1608 // drop the segment (an immediate acknowledge is always triggered) 1609 if (fReceiveWindow != 0 || segment.sequence != fReceiveNext) 1610 return DROP | IMMEDIATE_ACKNOWLEDGE; 1611 1612 action |= IMMEDIATE_ACKNOWLEDGE; 1613 } 1614 1615 if ((segment.flags & TCP_FLAG_FINISH) != 0) { 1616 // we need to remove the finish, too, as part of the data 1617 drop--; 1618 } 1619 1620 segment.flags &= ~(TCP_FLAG_FINISH | TCP_FLAG_PUSH); 1621 TRACE("* remove %ld bytes from the end", drop); 1622 gBufferModule->remove_trailer(buffer, drop); 1623 } 1624 1625 #ifdef TRACE_TCP 1626 if (advertisedWindow > fSendWindow) { 1627 TRACE(" Receive(): Window update %lu -> %lu", fSendWindow, 1628 advertisedWindow); 1629 } 1630 #endif 1631 1632 fSendWindow = advertisedWindow; 1633 if (advertisedWindow > fSendMaxWindow) 1634 fSendMaxWindow = advertisedWindow; 1635 1636 // Then look at the acknowledgement for any updates 1637 1638 if ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0) { 1639 // process acknowledged data 1640 if (fState == SYNCHRONIZE_RECEIVED) 1641 _MarkEstablished(); 1642 1643 if (fSendMax < segment.acknowledge) 1644 return DROP | IMMEDIATE_ACKNOWLEDGE; 1645 1646 if (segment.acknowledge < fSendUnacknowledged) { 1647 if (buffer->size == 0 && advertisedWindow == fSendWindow 1648 && (segment.flags & TCP_FLAG_FINISH) == 0) { 1649 TRACE("Receive(): duplicate ack!"); 1650 1651 _DuplicateAcknowledge(segment); 1652 } 1653 1654 return DROP; 1655 } else { 1656 // this segment acknowledges in flight data 1657 1658 if (fDuplicateAcknowledgeCount >= 3) { 1659 // deflate the window. 1660 fCongestionWindow = fSlowStartThreshold; 1661 } 1662 1663 fDuplicateAcknowledgeCount = 0; 1664 1665 if (fSendMax == segment.acknowledge) 1666 TRACE("Receive(): all inflight data ack'd!"); 1667 1668 if (segment.acknowledge > fSendQueue.LastSequence() 1669 && fState > ESTABLISHED) { 1670 TRACE("Receive(): FIN has been acknowledged!"); 1671 1672 switch (fState) { 1673 case FINISH_SENT: 1674 fState = FINISH_ACKNOWLEDGED; 1675 T(State(this)); 1676 break; 1677 case CLOSING: 1678 fState = TIME_WAIT; 1679 T(State(this)); 1680 _EnterTimeWait(); 1681 return DROP; 1682 case WAIT_FOR_FINISH_ACKNOWLEDGE: 1683 _Close(); 1684 break; 1685 1686 default: 1687 break; 1688 } 1689 } 1690 1691 if (fState != CLOSED) 1692 _Acknowledged(segment); 1693 } 1694 } 1695 1696 if (segment.flags & TCP_FLAG_URGENT) { 1697 if (fState == ESTABLISHED || fState == FINISH_SENT 1698 || fState == FINISH_ACKNOWLEDGED) { 1699 // TODO: Handle urgent data: 1700 // - RCV.UP <- max(RCV.UP, SEG.UP) 1701 // - signal the user that urgent data is available (SIGURG) 1702 } 1703 } 1704 1705 bool notify = false; 1706 1707 // The buffer may be freed if its data is added to the queue, so cache 1708 // the size as we still need it later. 1709 uint32 bufferSize = buffer->size; 1710 1711 if ((bufferSize > 0 || (segment.flags & TCP_FLAG_FINISH) != 0) 1712 && _ShouldReceive()) 1713 notify = _AddData(segment, buffer); 1714 else { 1715 if ((fFlags & FLAG_NO_RECEIVE) != 0) 1716 fReceiveNext += buffer->size; 1717 1718 action = (action & ~KEEP) | DROP; 1719 } 1720 1721 if ((segment.flags & TCP_FLAG_FINISH) != 0) { 1722 segmentLength++; 1723 if (fState != CLOSED && fState != LISTEN && fState != SYNCHRONIZE_SENT) { 1724 TRACE("Receive(): peer is finishing connection!"); 1725 fReceiveNext++; 1726 notify = true; 1727 1728 // FIN implies PUSH 1729 fReceiveQueue.SetPushPointer(); 1730 1731 // we'll reply immediately to the FIN if we are not 1732 // transitioning to TIME WAIT so we immediatly ACK it. 1733 action |= IMMEDIATE_ACKNOWLEDGE; 1734 1735 // other side is closing connection; change states 1736 switch (fState) { 1737 case ESTABLISHED: 1738 case SYNCHRONIZE_RECEIVED: 1739 fState = FINISH_RECEIVED; 1740 T(State(this)); 1741 break; 1742 case FINISH_SENT: 1743 // simultaneous close 1744 fState = CLOSING; 1745 T(State(this)); 1746 break; 1747 case FINISH_ACKNOWLEDGED: 1748 fState = TIME_WAIT; 1749 T(State(this)); 1750 _EnterTimeWait(); 1751 break; 1752 case TIME_WAIT: 1753 _UpdateTimeWait(); 1754 break; 1755 1756 default: 1757 break; 1758 } 1759 } 1760 } 1761 1762 if (notify) 1763 _NotifyReader(); 1764 1765 if (bufferSize > 0 || (segment.flags & TCP_FLAG_SYNCHRONIZE) != 0) 1766 action |= ACKNOWLEDGE; 1767 1768 _UpdateTimestamps(segment, segmentLength); 1769 1770 TRACE("Receive() Action %ld", action); 1771 1772 return action; 1773 } 1774 1775 1776 int32 1777 TCPEndpoint::SegmentReceived(tcp_segment_header& segment, net_buffer* buffer) 1778 { 1779 MutexLocker locker(fLock); 1780 1781 TRACE("SegmentReceived(): buffer %p (%lu bytes) address %s to %s\n" 1782 "\tflags 0x%x, seq %lu, ack %lu, wnd %lu", 1783 buffer, buffer->size, PrintAddress(buffer->source), 1784 PrintAddress(buffer->destination), segment.flags, segment.sequence, 1785 segment.acknowledge, 1786 (uint32)segment.advertised_window << fSendWindowShift); 1787 T(Receive(this, segment, 1788 (uint32)segment.advertised_window << fSendWindowShift, buffer)); 1789 int32 segmentAction = DROP; 1790 1791 switch (fState) { 1792 case LISTEN: 1793 segmentAction = _ListenReceive(segment, buffer); 1794 break; 1795 1796 case SYNCHRONIZE_SENT: 1797 segmentAction = _SynchronizeSentReceive(segment, buffer); 1798 break; 1799 1800 case SYNCHRONIZE_RECEIVED: 1801 case ESTABLISHED: 1802 case FINISH_RECEIVED: 1803 case WAIT_FOR_FINISH_ACKNOWLEDGE: 1804 case FINISH_SENT: 1805 case FINISH_ACKNOWLEDGED: 1806 case CLOSING: 1807 case TIME_WAIT: 1808 case CLOSED: 1809 segmentAction = _Receive(segment, buffer); 1810 break; 1811 } 1812 1813 // process acknowledge action as asked for by the *Receive() method 1814 if (segmentAction & IMMEDIATE_ACKNOWLEDGE) 1815 SendAcknowledge(true); 1816 else if (segmentAction & ACKNOWLEDGE) 1817 DelayedAcknowledge(); 1818 1819 if ((fFlags & (FLAG_CLOSED | FLAG_DELETE_ON_CLOSE)) 1820 == (FLAG_CLOSED | FLAG_DELETE_ON_CLOSE)) { 1821 locker.Unlock(); 1822 gSocketModule->release_socket(socket); 1823 } 1824 1825 return segmentAction; 1826 } 1827 1828 1829 // #pragma mark - send 1830 1831 1832 inline uint8 1833 TCPEndpoint::_CurrentFlags() 1834 { 1835 // we don't set FLAG_FINISH here, instead we do it 1836 // conditionally below depending if we are sending 1837 // the last bytes of the send queue. 1838 1839 switch (fState) { 1840 case CLOSED: 1841 return TCP_FLAG_RESET | TCP_FLAG_ACKNOWLEDGE; 1842 1843 case SYNCHRONIZE_SENT: 1844 return TCP_FLAG_SYNCHRONIZE; 1845 case SYNCHRONIZE_RECEIVED: 1846 return TCP_FLAG_SYNCHRONIZE | TCP_FLAG_ACKNOWLEDGE; 1847 1848 case ESTABLISHED: 1849 case FINISH_RECEIVED: 1850 case FINISH_ACKNOWLEDGED: 1851 case TIME_WAIT: 1852 case WAIT_FOR_FINISH_ACKNOWLEDGE: 1853 case FINISH_SENT: 1854 case CLOSING: 1855 return TCP_FLAG_ACKNOWLEDGE; 1856 1857 default: 1858 return 0; 1859 } 1860 } 1861 1862 1863 inline bool 1864 TCPEndpoint::_ShouldSendSegment(tcp_segment_header& segment, uint32 length, 1865 uint32 segmentMaxSize, uint32 flightSize) 1866 { 1867 if (length > 0) { 1868 // Avoid the silly window syndrome - we only send a segment in case: 1869 // - we have a full segment to send, or 1870 // - we're at the end of our buffer queue, or 1871 // - the buffer is at least larger than half of the maximum send window, 1872 // or 1873 // - we're retransmitting data 1874 if (length == segmentMaxSize 1875 || (fOptions & TCP_NODELAY) != 0 1876 || tcp_sequence(fSendNext + length) == fSendQueue.LastSequence() 1877 || (fSendMaxWindow > 0 && length >= fSendMaxWindow / 2)) 1878 return true; 1879 } 1880 1881 // check if we need to send a window update to the peer 1882 if (segment.advertised_window > 0) { 1883 // correct the window to take into account what already has been advertised 1884 uint32 window = (segment.advertised_window << fReceiveWindowShift) 1885 - (fReceiveMaxAdvertised - fReceiveNext).Number(); 1886 1887 // if we can advertise a window larger than twice the maximum segment 1888 // size, or half the maximum buffer size we send a window update 1889 if (window >= (fReceiveMaxSegmentSize << 1) 1890 || window >= (socket->receive.buffer_size >> 1)) 1891 return true; 1892 } 1893 1894 if ((segment.flags & (TCP_FLAG_SYNCHRONIZE | TCP_FLAG_FINISH 1895 | TCP_FLAG_RESET)) != 0) 1896 return true; 1897 1898 // We do have urgent data pending 1899 if (fSendUrgentOffset > fSendNext) 1900 return true; 1901 1902 // there is no reason to send a segment just now 1903 return false; 1904 } 1905 1906 1907 status_t 1908 TCPEndpoint::_SendQueued(bool force) 1909 { 1910 return _SendQueued(force, fSendWindow); 1911 } 1912 1913 1914 /*! Sends one or more TCP segments with the data waiting in the queue, or some 1915 specific flags that need to be sent. 1916 */ 1917 status_t 1918 TCPEndpoint::_SendQueued(bool force, uint32 sendWindow) 1919 { 1920 if (fRoute == NULL) 1921 return B_ERROR; 1922 1923 // in passive state? 1924 if (fState == LISTEN) 1925 return B_ERROR; 1926 1927 tcp_segment_header segment(_CurrentFlags()); 1928 1929 if ((fOptions & TCP_NOOPT) == 0) { 1930 if ((fFlags & FLAG_OPTION_TIMESTAMP) != 0) { 1931 segment.options |= TCP_HAS_TIMESTAMPS; 1932 segment.timestamp_reply = fReceivedTimestamp; 1933 segment.timestamp_value = tcp_now(); 1934 } 1935 1936 if ((segment.flags & TCP_FLAG_SYNCHRONIZE) != 0 1937 && fSendNext == fInitialSendSequence) { 1938 // add connection establishment options 1939 segment.max_segment_size = fReceiveMaxSegmentSize; 1940 if (fFlags & FLAG_OPTION_WINDOW_SCALE) { 1941 segment.options |= TCP_HAS_WINDOW_SCALE; 1942 segment.window_shift = fReceiveWindowShift; 1943 } 1944 } 1945 } 1946 1947 size_t availableBytes = fReceiveQueue.Free(); 1948 if (fFlags & FLAG_OPTION_WINDOW_SCALE) 1949 segment.advertised_window = availableBytes >> fReceiveWindowShift; 1950 else 1951 segment.advertised_window = min_c(TCP_MAX_WINDOW, availableBytes); 1952 1953 segment.acknowledge = fReceiveNext.Number(); 1954 1955 // Process urgent data 1956 if (fSendUrgentOffset > fSendNext) { 1957 segment.flags |= TCP_FLAG_URGENT; 1958 segment.urgent_offset = (fSendUrgentOffset - fSendNext).Number(); 1959 } else { 1960 fSendUrgentOffset = fSendUnacknowledged.Number(); 1961 // Keep urgent offset updated, so that it doesn't reach into our 1962 // send window on overlap 1963 segment.urgent_offset = 0; 1964 } 1965 1966 if (fCongestionWindow > 0 && fCongestionWindow < sendWindow) 1967 sendWindow = fCongestionWindow; 1968 1969 // fSendUnacknowledged 1970 // | fSendNext fSendMax 1971 // | | | 1972 // v v v 1973 // ----------------------------------- 1974 // | effective window | 1975 // ----------------------------------- 1976 1977 // Flight size represents the window of data which is currently in the 1978 // ether. We should never send data such as the flight size becomes larger 1979 // than the effective window. Note however that the effective window may be 1980 // reduced (by congestion for instance), so at some point in time flight 1981 // size may be larger than the currently calculated window. 1982 1983 uint32 flightSize = (fSendMax - fSendUnacknowledged).Number(); 1984 uint32 consumedWindow = (fSendNext - fSendUnacknowledged).Number(); 1985 1986 if (consumedWindow > sendWindow) { 1987 sendWindow = 0; 1988 // TODO: enter persist state? try to get a window update. 1989 } else 1990 sendWindow -= consumedWindow; 1991 1992 if (force && sendWindow == 0 && fSendNext <= fSendQueue.LastSequence()) { 1993 // send one byte of data to ask for a window update 1994 // (triggered by the persist timer) 1995 sendWindow = 1; 1996 } 1997 1998 uint32 length = min_c(fSendQueue.Available(fSendNext), sendWindow); 1999 tcp_sequence previousSendNext = fSendNext; 2000 2001 do { 2002 uint32 segmentMaxSize = fSendMaxSegmentSize 2003 - tcp_options_length(segment); 2004 uint32 segmentLength = min_c(length, segmentMaxSize); 2005 2006 if (fSendNext + segmentLength == fSendQueue.LastSequence()) { 2007 if (state_needs_finish(fState)) 2008 segment.flags |= TCP_FLAG_FINISH; 2009 if (length > 0) 2010 segment.flags |= TCP_FLAG_PUSH; 2011 } 2012 2013 // Determine if we should really send this segment 2014 if (!force && !_ShouldSendSegment(segment, segmentLength, 2015 segmentMaxSize, flightSize)) { 2016 if (fSendQueue.Available() 2017 && !gStackModule->is_timer_active(&fPersistTimer) 2018 && !gStackModule->is_timer_active(&fRetransmitTimer)) 2019 _StartPersistTimer(); 2020 break; 2021 } 2022 2023 net_buffer *buffer = gBufferModule->create(256); 2024 if (buffer == NULL) 2025 return B_NO_MEMORY; 2026 2027 status_t status = B_OK; 2028 if (segmentLength > 0) 2029 status = fSendQueue.Get(buffer, fSendNext, segmentLength); 2030 if (status < B_OK) { 2031 gBufferModule->free(buffer); 2032 return status; 2033 } 2034 2035 LocalAddress().CopyTo(buffer->source); 2036 PeerAddress().CopyTo(buffer->destination); 2037 2038 uint32 size = buffer->size; 2039 segment.sequence = fSendNext.Number(); 2040 2041 TRACE("SendQueued(): buffer %p (%lu bytes) address %s to %s\n" 2042 "\tflags 0x%x, seq %lu, ack %lu, rwnd %hu, cwnd %lu, ssthresh %lu\n" 2043 "\tlen %lu first %lu last %lu", 2044 buffer, buffer->size, PrintAddress(buffer->source), 2045 PrintAddress(buffer->destination), segment.flags, segment.sequence, 2046 segment.acknowledge, segment.advertised_window, 2047 fCongestionWindow, fSlowStartThreshold, segmentLength, 2048 fSendQueue.FirstSequence().Number(), 2049 fSendQueue.LastSequence().Number()); 2050 T(Send(this, segment, buffer, fSendQueue.FirstSequence(), 2051 fSendQueue.LastSequence())); 2052 2053 PROBE(buffer, sendWindow); 2054 sendWindow -= buffer->size; 2055 2056 status = add_tcp_header(AddressModule(), segment, buffer); 2057 if (status != B_OK) { 2058 gBufferModule->free(buffer); 2059 return status; 2060 } 2061 2062 // Update send status - we need to do this before we send the data 2063 // for local connections as the answer is directly handled 2064 2065 if (segment.flags & TCP_FLAG_SYNCHRONIZE) { 2066 segment.options &= ~TCP_HAS_WINDOW_SCALE; 2067 segment.max_segment_size = 0; 2068 size++; 2069 } 2070 2071 if (segment.flags & TCP_FLAG_FINISH) 2072 size++; 2073 2074 uint32 sendMax = fSendMax.Number(); 2075 fSendNext += size; 2076 if (fSendMax < fSendNext) 2077 fSendMax = fSendNext; 2078 2079 fReceiveMaxAdvertised = fReceiveNext 2080 + ((uint32)segment.advertised_window << fReceiveWindowShift); 2081 2082 status = next->module->send_routed_data(next, fRoute, buffer); 2083 if (status < B_OK) { 2084 gBufferModule->free(buffer); 2085 2086 fSendNext = segment.sequence; 2087 fSendMax = sendMax; 2088 // restore send status 2089 return status; 2090 } 2091 2092 if (segment.flags & TCP_FLAG_ACKNOWLEDGE) 2093 fLastAcknowledgeSent = segment.acknowledge; 2094 2095 length -= segmentLength; 2096 segment.flags &= ~(TCP_FLAG_SYNCHRONIZE | TCP_FLAG_RESET 2097 | TCP_FLAG_FINISH); 2098 } while (length > 0); 2099 2100 // if we sent data from the beggining of the send queue, 2101 // start the retransmition timer 2102 if (previousSendNext == fSendUnacknowledged 2103 && fSendNext > previousSendNext) { 2104 TRACE(" SendQueue(): set retransmit timer with rto %llu", 2105 fRetransmitTimeout); 2106 2107 gStackModule->set_timer(&fRetransmitTimer, fRetransmitTimeout); 2108 } 2109 2110 return B_OK; 2111 } 2112 2113 2114 int 2115 TCPEndpoint::_MaxSegmentSize(const sockaddr* address) const 2116 { 2117 return next->module->get_mtu(next, address) - sizeof(tcp_header); 2118 } 2119 2120 2121 status_t 2122 TCPEndpoint::_PrepareSendPath(const sockaddr* peer) 2123 { 2124 if (fRoute == NULL) { 2125 fRoute = gDatalinkModule->get_route(Domain(), peer); 2126 if (fRoute == NULL) 2127 return ENETUNREACH; 2128 2129 if ((fRoute->flags & RTF_LOCAL) != 0) 2130 fFlags |= FLAG_LOCAL; 2131 } 2132 2133 // make sure connection does not already exist 2134 status_t status = fManager->SetConnection(this, *LocalAddress(), peer, 2135 fRoute->interface_address->local); 2136 if (status < B_OK) 2137 return status; 2138 2139 fInitialSendSequence = system_time() >> 4; 2140 fSendNext = fInitialSendSequence; 2141 fSendUnacknowledged = fInitialSendSequence; 2142 fSendMax = fInitialSendSequence; 2143 fSendUrgentOffset = fInitialSendSequence; 2144 2145 // we are counting the SYN here 2146 fSendQueue.SetInitialSequence(fSendNext + 1); 2147 2148 fReceiveMaxSegmentSize = _MaxSegmentSize(peer); 2149 2150 // Compute the window shift we advertise to our peer - if it doesn't support 2151 // this option, this will be reset to 0 (when its SYN is received) 2152 fReceiveWindowShift = 0; 2153 while (fReceiveWindowShift < TCP_MAX_WINDOW_SHIFT 2154 && (0xffffUL << fReceiveWindowShift) < socket->receive.buffer_size) { 2155 fReceiveWindowShift++; 2156 } 2157 2158 return B_OK; 2159 } 2160 2161 2162 void 2163 TCPEndpoint::_Acknowledged(tcp_segment_header& segment) 2164 { 2165 size_t previouslyUsed = fSendQueue.Used(); 2166 2167 fSendQueue.RemoveUntil(segment.acknowledge); 2168 fSendUnacknowledged = segment.acknowledge; 2169 2170 if (fSendNext < fSendUnacknowledged) 2171 fSendNext = fSendUnacknowledged; 2172 2173 if (fSendUnacknowledged == fSendMax) 2174 gStackModule->cancel_timer(&fRetransmitTimer); 2175 2176 if (fSendQueue.Used() < previouslyUsed) { 2177 // this ACK acknowledged data 2178 2179 if (segment.options & TCP_HAS_TIMESTAMPS) 2180 _UpdateRoundTripTime(tcp_diff_timestamp(segment.timestamp_reply)); 2181 else { 2182 // TODO: Fallback to RFC 793 type estimation 2183 } 2184 2185 if (is_writable(fState)) { 2186 // notify threads waiting on the socket to become writable again 2187 fSendList.Signal(); 2188 gSocketModule->notify(socket, B_SELECT_WRITE, fSendQueue.Used()); 2189 } 2190 2191 if (fCongestionWindow < fSlowStartThreshold) 2192 fCongestionWindow += fSendMaxSegmentSize; 2193 } 2194 2195 if (fCongestionWindow >= fSlowStartThreshold) { 2196 uint32 increment = fSendMaxSegmentSize * fSendMaxSegmentSize; 2197 2198 if (increment < fCongestionWindow) 2199 increment = 1; 2200 else 2201 increment /= fCongestionWindow; 2202 2203 fCongestionWindow += increment; 2204 } 2205 2206 // if there is data left to be send, send it now 2207 if (fSendQueue.Used() > 0) 2208 _SendQueued(); 2209 } 2210 2211 2212 void 2213 TCPEndpoint::_Retransmit() 2214 { 2215 TRACE("Retransmit()"); 2216 _ResetSlowStart(); 2217 fSendNext = fSendUnacknowledged; 2218 _SendQueued(); 2219 } 2220 2221 2222 void 2223 TCPEndpoint::_UpdateRoundTripTime(int32 roundTripTime) 2224 { 2225 int32 rtt = roundTripTime; 2226 2227 // "smooth" round trip time as per Van Jacobson 2228 rtt -= fRoundTripTime / 8; 2229 fRoundTripTime += rtt; 2230 if (rtt < 0) 2231 rtt = -rtt; 2232 rtt -= fRoundTripDeviation / 4; 2233 fRoundTripDeviation += rtt; 2234 2235 fRetransmitTimeout = ((fRoundTripTime / 4 + fRoundTripDeviation) / 2) 2236 * kTimestampFactor; 2237 2238 TRACE(" RTO is now %llu (after rtt %ldms)", fRetransmitTimeout, 2239 roundTripTime); 2240 } 2241 2242 2243 void 2244 TCPEndpoint::_ResetSlowStart() 2245 { 2246 fSlowStartThreshold = max_c((fSendMax - fSendUnacknowledged).Number() / 2, 2247 2 * fSendMaxSegmentSize); 2248 fCongestionWindow = fSendMaxSegmentSize; 2249 } 2250 2251 2252 // #pragma mark - timer 2253 2254 2255 /*static*/ void 2256 TCPEndpoint::_RetransmitTimer(net_timer* timer, void* _endpoint) 2257 { 2258 TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint; 2259 T(Timer(endpoint, "retransmit")); 2260 2261 MutexLocker locker(endpoint->fLock); 2262 if (!locker.IsLocked()) 2263 return; 2264 2265 endpoint->_Retransmit(); 2266 } 2267 2268 2269 /*static*/ void 2270 TCPEndpoint::_PersistTimer(net_timer* timer, void* _endpoint) 2271 { 2272 TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint; 2273 T(Timer(endpoint, "persist")); 2274 2275 MutexLocker locker(endpoint->fLock); 2276 if (!locker.IsLocked()) 2277 return; 2278 2279 // the timer might not have been canceled early enough 2280 if (endpoint->State() == CLOSED) 2281 return; 2282 2283 endpoint->_SendQueued(true); 2284 } 2285 2286 2287 /*static*/ void 2288 TCPEndpoint::_DelayedAcknowledgeTimer(net_timer* timer, void* _endpoint) 2289 { 2290 TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint; 2291 T(Timer(endpoint, "delayed ack")); 2292 2293 MutexLocker locker(endpoint->fLock); 2294 if (!locker.IsLocked()) 2295 return; 2296 2297 // the timer might not have been canceled early enough 2298 if (endpoint->State() == CLOSED) 2299 return; 2300 2301 endpoint->SendAcknowledge(true); 2302 } 2303 2304 2305 /*static*/ void 2306 TCPEndpoint::_TimeWaitTimer(net_timer* timer, void* _endpoint) 2307 { 2308 TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint; 2309 T(Timer(endpoint, "time-wait")); 2310 2311 MutexLocker locker(endpoint->fLock); 2312 if (!locker.IsLocked()) 2313 return; 2314 2315 if ((endpoint->fFlags & FLAG_CLOSED) == 0) { 2316 endpoint->fFlags |= FLAG_DELETE_ON_CLOSE; 2317 return; 2318 } 2319 2320 locker.Unlock(); 2321 2322 gSocketModule->release_socket(endpoint->socket); 2323 } 2324 2325 2326 // #pragma mark - 2327 2328 2329 void 2330 TCPEndpoint::Dump() const 2331 { 2332 kprintf("TCP endpoint %p\n", this); 2333 kprintf(" state: %s\n", name_for_state(fState)); 2334 kprintf(" flags: 0x%" B_PRIx32 "\n", fFlags); 2335 #if KDEBUG 2336 kprintf(" lock: { %p, holder: %" B_PRId32 " }\n", &fLock, fLock.holder); 2337 #endif 2338 kprintf(" accept sem: %" B_PRId32 "\n", fAcceptSemaphore); 2339 kprintf(" options: 0x%" B_PRIx32 "\n", (uint32)fOptions); 2340 kprintf(" send\n"); 2341 kprintf(" window shift: %u\n", fSendWindowShift); 2342 kprintf(" unacknowledged: %" B_PRIu32 "\n", 2343 fSendUnacknowledged.Number()); 2344 kprintf(" next: %" B_PRIu32 "\n", fSendNext.Number()); 2345 kprintf(" max: %" B_PRIu32 "\n", fSendMax.Number()); 2346 kprintf(" urgent offset: %" B_PRIu32 "\n", fSendUrgentOffset.Number()); 2347 kprintf(" window: %" B_PRIu32 "\n", fSendWindow); 2348 kprintf(" max window: %" B_PRIu32 "\n", fSendMaxWindow); 2349 kprintf(" max segment size: %" B_PRIu32 "\n", fSendMaxSegmentSize); 2350 kprintf(" queue: %lu / %lu\n", fSendQueue.Used(), fSendQueue.Size()); 2351 #if DEBUG_BUFFER_QUEUE 2352 fSendQueue.Dump(); 2353 #endif 2354 kprintf(" last acknowledge sent: %" B_PRIu32 "\n", 2355 fLastAcknowledgeSent.Number()); 2356 kprintf(" initial sequence: %" B_PRIu32 "\n", 2357 fInitialSendSequence.Number()); 2358 kprintf(" receive\n"); 2359 kprintf(" window shift: %u\n", fReceiveWindowShift); 2360 kprintf(" next: %" B_PRIu32 "\n", fReceiveNext.Number()); 2361 kprintf(" max advertised: %" B_PRIu32 "\n", 2362 fReceiveMaxAdvertised.Number()); 2363 kprintf(" window: %" B_PRIu32 "\n", fReceiveWindow); 2364 kprintf(" max segment size: %" B_PRIu32 "\n", fReceiveMaxSegmentSize); 2365 kprintf(" queue: %lu / %lu\n", fReceiveQueue.Available(), 2366 fReceiveQueue.Size()); 2367 #if DEBUG_BUFFER_QUEUE 2368 fReceiveQueue.Dump(); 2369 #endif 2370 kprintf(" initial sequence: %" B_PRIu32 "\n", 2371 fInitialReceiveSequence.Number()); 2372 kprintf(" duplicate acknowledge count: %" B_PRIu32 "\n", 2373 fDuplicateAcknowledgeCount); 2374 kprintf(" round trip time: %" B_PRId32 " (deviation %" B_PRId32 ")\n", 2375 fRoundTripTime, fRoundTripDeviation); 2376 kprintf(" retransmit timeout: %" B_PRId64 "\n", fRetransmitTimeout); 2377 kprintf(" congestion window: %" B_PRIu32 "\n", fCongestionWindow); 2378 kprintf(" slow start threshold: %" B_PRIu32 "\n", fSlowStartThreshold); 2379 } 2380 2381