/*
 * Copyright 2006-2010, Haiku, Inc. All Rights Reserved.
 * Distributed under the terms of the MIT License.
 *
 * Authors:
 *		Andrew Galante, haiku.galante@gmail.com
 *		Axel Dörfler, axeld@pinc-software.de
 *		Hugo Santos, hugosantos@gmail.com
 */


#include "TCPEndpoint.h"

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/tcp.h>
#include <new>
#include <signal.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>

#include <KernelExport.h>
#include <Select.h>

#include <net_buffer.h>
#include <net_datalink.h>
#include <net_stat.h>
#include <NetBufferUtilities.h>
#include <NetUtilities.h>

#include <lock.h>
#include <tracing.h>
#include <util/AutoLock.h>
#include <util/list.h>

#include "EndpointManager.h"


// References:
//	- RFC 793 - Transmission Control Protocol
//	- RFC 813 - Window and Acknowledgement Strategy in TCP
//	- RFC 1337 - TIME_WAIT Assassination Hazards in TCP
//
// Things incomplete in this implementation:
//	- TCP Extensions for High Performance, RFC 1323 - RTTM, PAWS
//	- Congestion Control, RFC 5681
//	- Limited Transmit, RFC 3042
//	- SACK, Selective Acknowledgment; RFC 2018, RFC 2883, RFC 6675
//	- NewReno Modification to TCP's Fast Recovery, RFC 2582
//
// Things this implementation currently doesn't implement:
//	- Explicit Congestion Notification (ECN), RFC 3168
//	- SYN-Cache
//	- Forward RTO-Recovery, RFC 4138
//	- Time-Wait hash instead of keeping sockets alive

#define PrintAddress(address) \
	AddressString(Domain(), address, true).Data()

//#define TRACE_TCP
//#define PROBE_TCP

#ifdef TRACE_TCP
// the space before ', ##args' is important in order for this to work with cpp 2.95
#	define TRACE(format, args...)	dprintf("%" B_PRId32 ": TCP [%" \
		B_PRIdBIGTIME "] %p (%12s) " format "\n", find_thread(NULL), \
		system_time(), this, name_for_state(fState) , ##args)
#else
#	define TRACE(args...)			do { } while (0)
#endif
#ifdef PROBE_TCP
// Prints one "TCP PROBE" line with the sender-side counters of the endpoint
// for \a buffer; \a window is printed in the "win" column.
#	define PROBE(buffer, window) \
	dprintf("TCP PROBE %" B_PRIdBIGTIME " %s %s %" B_PRIu32 " snxt %" B_PRIu32 \
	" suna %" B_PRIu32 " cw %" B_PRIu32 " sst %" B_PRIu32 " win %" \
	B_PRIu32 " swin %" B_PRIu32 " smax-suna %" B_PRIu32 " savail %" \
	B_PRIuSIZE " sqused %" B_PRIuSIZE " rto %" B_PRIdBIGTIME "\n", \
	system_time(), PrintAddress(buffer->source), \
	PrintAddress(buffer->destination), buffer->size, fSendNext.Number(), \
	fSendUnacknowledged.Number(), fCongestionWindow, fSlowStartThreshold, \
	window, fSendWindow, (fSendMax - fSendUnacknowledged).Number(), \
	fSendQueue.Available(fSendNext), fSendQueue.Used(), fRetransmitTimeout)
#else
#	define PROBE(buffer, window)	do { } while (0)
#endif

#if TCP_TRACING
namespace TCPTracing {

/*!	Trace entry for a received buffer. The sequence, acknowledge and flags
	values are copied from the segment header; the buffer size and the
	endpoint state are sampled when the entry is constructed.
*/
class Receive : public AbstractTraceEntry {
public:
	Receive(TCPEndpoint* endpoint, tcp_segment_header& segment, uint32 window,
			net_buffer* buffer)
		:
		fEndpoint(endpoint),
		fBuffer(buffer),
		fBufferSize(buffer->size),
		fSequence(segment.sequence),
		fAcknowledge(segment.acknowledge),
		fWindow(window),
		fState(endpoint->State()),
		fFlags(segment.flags)
	{
		Initialized();
	}

	virtual void AddDump(TraceOutput& out)
	{
		out.Print("tcp:%p (%12s) receive buffer %p (%" B_PRIu32 " bytes), "
			"flags %#" B_PRIx8 ", seq %" B_PRIu32 ", ack %" B_PRIu32
			", wnd %" B_PRIu32, fEndpoint, name_for_state(fState), fBuffer,
			fBufferSize, fFlags, fSequence, fAcknowledge, fWindow);
	}

protected:
	TCPEndpoint*	fEndpoint;
	net_buffer*		fBuffer;
	uint32			fBufferSize;
	uint32			fSequence;
	uint32			fAcknowledge;
	uint32			fWindow;
	tcp_state		fState;
	uint8			fFlags;
};


/*!	Trace entry for a sent buffer. In addition to the segment header fields
	it stores the numbers of the \a firstSequence and \a lastSequence
	arguments.
*/
class Send : public AbstractTraceEntry {
public:
	Send(TCPEndpoint* endpoint, tcp_segment_header& segment,
			net_buffer* buffer, tcp_sequence firstSequence,
			tcp_sequence lastSequence)
		:
		fEndpoint(endpoint),
		fBuffer(buffer),
		fBufferSize(buffer->size),
		fSequence(segment.sequence),
		fAcknowledge(segment.acknowledge),
		fFirstSequence(firstSequence.Number()),
		fLastSequence(lastSequence.Number()),
		fState(endpoint->State()),
		fFlags(segment.flags)
	{
		Initialized();
	}

	virtual void AddDump(TraceOutput& out)
	{
		out.Print("tcp:%p (%12s) send buffer %p (%" B_PRIu32 " bytes), "
			"flags %#" B_PRIx8 ", seq %" B_PRIu32 ", ack %" B_PRIu32
			", first %" B_PRIu32 ", last %" B_PRIu32, fEndpoint,
			name_for_state(fState), fBuffer, fBufferSize, fFlags, fSequence,
			fAcknowledge, fFirstSequence, fLastSequence);
	}

protected:
	TCPEndpoint*	fEndpoint;
	net_buffer*		fBuffer;
	uint32			fBufferSize;
	uint32			fSequence;
	uint32			fAcknowledge;
	uint32			fFirstSequence;
	uint32			fLastSequence;
	tcp_state		fState;
	uint8			fFlags;
};


/*!	Trace entry for a state change; stores the state the endpoint reports
	at the time the entry is constructed.
*/
class State : public AbstractTraceEntry {
public:
	State(TCPEndpoint* endpoint)
		:
		fEndpoint(endpoint),
		fState(endpoint->State())
	{
		Initialized();
	}

	virtual void AddDump(TraceOutput& out)
	{
		out.Print("tcp:%p (%12s) state change", fEndpoint,
			name_for_state(fState));
	}

protected:
	TCPEndpoint*	fEndpoint;
	tcp_state		fState;
};


/*!	Trace entry that pairs a listening endpoint with the endpoint it
	spawned.
*/
class Spawn : public AbstractTraceEntry {
public:
	Spawn(TCPEndpoint* listeningEndpoint, TCPEndpoint* spawnedEndpoint)
		:
		fListeningEndpoint(listeningEndpoint),
		fSpawnedEndpoint(spawnedEndpoint)
	{
		Initialized();
	}

	virtual void AddDump(TraceOutput& out)
	{
		out.Print("tcp:%p spawns %p", fListeningEndpoint, fSpawnedEndpoint);
	}

protected:
	TCPEndpoint*	fListeningEndpoint;
	TCPEndpoint*	fSpawnedEndpoint;
};


/*!	Trace entry for an error; stores the message pointer and the source
	line passed by the caller. The \a error string is not copied.
*/
class Error : public AbstractTraceEntry {
public:
	Error(TCPEndpoint* endpoint, const char* error, int32 line)
		:
		fEndpoint(endpoint),
		fLine(line),
		fError(error),
		fState(endpoint->State())
	{
		Initialized();
	}

	virtual void AddDump(TraceOutput& out)
	{
		out.Print("tcp:%p (%12s) error at line %" B_PRId32 ": %s", fEndpoint,
			name_for_state(fState), fLine, fError);
	}

protected:
	TCPEndpoint*	fEndpoint;
	int32			fLine;
	const char*		fError;
	tcp_state		fState;
};


/*!	Trace entry for a timer being set; \a which names the timer and
	\a timeout is the value printed by AddDump().
*/
class TimerSet : public AbstractTraceEntry {
public:
	TimerSet(TCPEndpoint* endpoint, const char* which, bigtime_t timeout)
		:
		fEndpoint(endpoint),
		fWhich(which),
		fTimeout(timeout),
		fState(endpoint->State())
	{
		Initialized();
	}

	virtual void AddDump(TraceOutput& out)
	{
		out.Print("tcp:%p (%12s) %s timer set to %" B_PRIdBIGTIME, fEndpoint,
			name_for_state(fState), fWhich, fTimeout);
	}

protected:
	TCPEndpoint*	fEndpoint;
	const char*		fWhich;
	bigtime_t		fTimeout;
	tcp_state		fState;
};


/*!	Trace entry for a timer that has fired; \a which names the timer. */
class TimerTriggered : public AbstractTraceEntry {
public:
	TimerTriggered(TCPEndpoint* endpoint, const char* which)
		:
		fEndpoint(endpoint),
		fWhich(which),
		fState(endpoint->State())
	{
		Initialized();
	}

	virtual void AddDump(TraceOutput& out)
	{
		out.Print("tcp:%p (%12s) %s timer triggered", fEndpoint,
			name_for_state(fState), fWhich);
	}

protected:
	TCPEndpoint*	fEndpoint;
	const char*		fWhich;
	tcp_state		fState;
};


/*!	Trace entry for a protocol API call; \a which names the call. */
class APICall : public AbstractTraceEntry {
public:
	APICall(TCPEndpoint* endpoint, const char* which)
		:
		fEndpoint(endpoint),
		fWhich(which),
		fState(endpoint->State())
	{
		Initialized();
	}

	virtual void AddDump(TraceOutput& out)
	{
		out.Print("tcp:%p (%12s) api call: %s", fEndpoint,
			name_for_state(fState), fWhich);
	}

protected:
	TCPEndpoint*	fEndpoint;
	const char*		fWhich;
	tcp_state		fState;
};

}	// namespace TCPTracing

#	define T(x)	new(std::nothrow) TCPTracing::x
#else
#	define T(x)
#endif	// TCP_TRACING
fWhich); 295 } 296 297 protected: 298 TCPEndpoint* fEndpoint; 299 const char* fWhich; 300 tcp_state fState; 301 }; 302 303 } // namespace TCPTracing 304 305 # define T(x) new(std::nothrow) TCPTracing::x 306 #else 307 # define T(x) 308 #endif // TCP_TRACING 309 310 311 // constants for the fFlags field 312 enum { 313 FLAG_OPTION_WINDOW_SCALE = 0x01, 314 FLAG_OPTION_TIMESTAMP = 0x02, 315 // TODO: Should FLAG_NO_RECEIVE apply as well to received connections? 316 // That is, what is expected from accept() after a shutdown() 317 // is performed on a listen()ing socket. 318 FLAG_NO_RECEIVE = 0x04, 319 FLAG_CLOSED = 0x08, 320 FLAG_DELETE_ON_CLOSE = 0x10, 321 FLAG_LOCAL = 0x20, 322 FLAG_RECOVERY = 0x40, 323 FLAG_OPTION_SACK_PERMITTED = 0x80, 324 }; 325 326 327 static const int kTimestampFactor = 1000; 328 // conversion factor between usec system time and msec tcp time 329 330 331 static inline bigtime_t 332 absolute_timeout(bigtime_t timeout) 333 { 334 if (timeout == 0 || timeout == B_INFINITE_TIMEOUT) 335 return timeout; 336 337 return timeout + system_time(); 338 } 339 340 341 static inline status_t 342 posix_error(status_t error) 343 { 344 if (error == B_TIMED_OUT) 345 return B_WOULD_BLOCK; 346 347 return error; 348 } 349 350 351 static inline bool 352 in_window(const tcp_sequence& sequence, const tcp_sequence& receiveNext, 353 uint32 receiveWindow) 354 { 355 return sequence >= receiveNext && sequence < (receiveNext + receiveWindow); 356 } 357 358 359 static inline bool 360 segment_in_sequence(const tcp_segment_header& segment, int size, 361 const tcp_sequence& receiveNext, uint32 receiveWindow) 362 { 363 tcp_sequence sequence(segment.sequence); 364 if (size == 0) { 365 if (receiveWindow == 0) 366 return sequence == receiveNext; 367 return in_window(sequence, receiveNext, receiveWindow); 368 } else { 369 if (receiveWindow == 0) 370 return false; 371 return in_window(sequence, receiveNext, receiveWindow) 372 || in_window(sequence + size - 1, receiveNext, receiveWindow); 
373 } 374 } 375 376 377 static inline bool 378 is_writable(tcp_state state) 379 { 380 return state == ESTABLISHED || state == FINISH_RECEIVED; 381 } 382 383 384 static inline bool 385 is_establishing(tcp_state state) 386 { 387 return state == SYNCHRONIZE_SENT || state == SYNCHRONIZE_RECEIVED; 388 } 389 390 391 static inline uint32 tcp_now() 392 { 393 return system_time() / kTimestampFactor; 394 } 395 396 397 static inline uint32 tcp_diff_timestamp(uint32 base) 398 { 399 uint32 now = tcp_now(); 400 401 if (now > base) 402 return now - base; 403 404 return now + UINT_MAX - base; 405 } 406 407 408 static inline bool 409 state_needs_finish(int32 state) 410 { 411 return state == WAIT_FOR_FINISH_ACKNOWLEDGE 412 || state == FINISH_SENT || state == CLOSING; 413 } 414 415 416 // #pragma mark - 417 418 419 TCPEndpoint::TCPEndpoint(net_socket* socket) 420 : 421 ProtocolSocket(socket), 422 fManager(NULL), 423 fOptions(0), 424 fSendWindowShift(0), 425 fReceiveWindowShift(0), 426 fSendUnacknowledged(0), 427 fSendNext(0), 428 fSendMax(0), 429 fSendUrgentOffset(0), 430 fSendWindow(0), 431 fSendMaxWindow(0), 432 fSendMaxSegmentSize(TCP_DEFAULT_MAX_SEGMENT_SIZE), 433 fSendMaxSegments(0), 434 fSendQueue(socket->send.buffer_size), 435 fInitialSendSequence(0), 436 fPreviousHighestAcknowledge(0), 437 fDuplicateAcknowledgeCount(0), 438 fPreviousFlightSize(0), 439 fRecover(0), 440 fRoute(NULL), 441 fReceiveNext(0), 442 fReceiveMaxAdvertised(0), 443 fReceiveWindow(socket->receive.buffer_size), 444 fReceiveMaxSegmentSize(TCP_DEFAULT_MAX_SEGMENT_SIZE), 445 fReceiveQueue(socket->receive.buffer_size), 446 fSmoothedRoundTripTime(0), 447 fRoundTripVariation(0), 448 fSendTime(0), 449 fRoundTripStartSequence(0), 450 fRetransmitTimeout(TCP_INITIAL_RTT), 451 fReceivedTimestamp(0), 452 fCongestionWindow(0), 453 fSlowStartThreshold(0), 454 fState(CLOSED), 455 fFlags(FLAG_OPTION_WINDOW_SCALE | FLAG_OPTION_TIMESTAMP | FLAG_OPTION_SACK_PERMITTED) 456 { 457 // TODO: to be replaced with a real read/write 
locking strategy! 458 mutex_init(&fLock, "tcp lock"); 459 460 fReceiveCondition.Init(this, "tcp receive"); 461 fSendCondition.Init(this, "tcp send"); 462 463 gStackModule->init_timer(&fPersistTimer, TCPEndpoint::_PersistTimer, this); 464 gStackModule->init_timer(&fRetransmitTimer, TCPEndpoint::_RetransmitTimer, 465 this); 466 gStackModule->init_timer(&fDelayedAcknowledgeTimer, 467 TCPEndpoint::_DelayedAcknowledgeTimer, this); 468 gStackModule->init_timer(&fTimeWaitTimer, TCPEndpoint::_TimeWaitTimer, 469 this); 470 471 T(APICall(this, "constructor")); 472 } 473 474 475 TCPEndpoint::~TCPEndpoint() 476 { 477 mutex_lock(&fLock); 478 479 T(APICall(this, "destructor")); 480 481 _CancelConnectionTimers(); 482 gStackModule->cancel_timer(&fTimeWaitTimer); 483 T(TimerSet(this, "time-wait", -1)); 484 485 if (fManager != NULL) { 486 fManager->Unbind(this); 487 put_endpoint_manager(fManager); 488 } 489 490 mutex_destroy(&fLock); 491 492 // we need to wait for all timers to return 493 gStackModule->wait_for_timer(&fRetransmitTimer); 494 gStackModule->wait_for_timer(&fPersistTimer); 495 gStackModule->wait_for_timer(&fDelayedAcknowledgeTimer); 496 gStackModule->wait_for_timer(&fTimeWaitTimer); 497 498 gDatalinkModule->put_route(Domain(), fRoute); 499 } 500 501 502 status_t 503 TCPEndpoint::InitCheck() const 504 { 505 return B_OK; 506 } 507 508 509 // #pragma mark - protocol API 510 511 512 status_t 513 TCPEndpoint::Open() 514 { 515 TRACE("Open()"); 516 T(APICall(this, "open")); 517 518 status_t status = ProtocolSocket::Open(); 519 if (status < B_OK) 520 return status; 521 522 fManager = get_endpoint_manager(Domain()); 523 if (fManager == NULL) 524 return EAFNOSUPPORT; 525 526 return B_OK; 527 } 528 529 530 status_t 531 TCPEndpoint::Close() 532 { 533 MutexLocker locker(fLock); 534 535 TRACE("Close()"); 536 T(APICall(this, "close")); 537 538 if (fState == LISTEN) 539 delete_sem(fAcceptSemaphore); 540 541 if (fState == SYNCHRONIZE_SENT || fState == LISTEN) { 542 // TODO: what about 
linger in case of SYNCHRONIZE_SENT? 543 fState = CLOSED; 544 T(State(this)); 545 return B_OK; 546 } 547 548 // handle linger with zero timeout 549 if ((socket->options & SO_LINGER) != 0 && socket->linger == 0) { 550 fState = CLOSED; 551 T(State(this)); 552 return _SendQueued(true); 553 } 554 555 status_t status = _Disconnect(true); 556 if (status != B_OK) 557 return status; 558 559 if ((socket->options & SO_LINGER) != 0) { 560 TRACE("Close(): Lingering for %i secs", socket->linger); 561 562 bigtime_t maximum = absolute_timeout(socket->linger * 1000000LL); 563 564 while (fSendQueue.Used() > 0) { 565 status = _WaitForCondition(fSendCondition, locker, maximum); 566 if (status == B_TIMED_OUT || status == B_WOULD_BLOCK) 567 break; 568 else if (status < B_OK) 569 return status; 570 } 571 572 TRACE("Close(): after waiting, the SendQ was left with %" B_PRIuSIZE 573 " bytes.", fSendQueue.Used()); 574 } 575 return B_OK; 576 } 577 578 579 void 580 TCPEndpoint::Free() 581 { 582 MutexLocker _(fLock); 583 584 TRACE("Free()"); 585 T(APICall(this, "free")); 586 587 if (fState <= SYNCHRONIZE_SENT) 588 return; 589 590 // we are only interested in the timer, not in changing state 591 _EnterTimeWait(); 592 593 fFlags |= FLAG_CLOSED; 594 if ((fFlags & FLAG_DELETE_ON_CLOSE) == 0) { 595 // we'll be freed later when the 2MSL timer expires 596 gSocketModule->acquire_socket(socket); 597 } 598 } 599 600 601 /*! Creates and sends a synchronize packet to /a address, and then waits 602 until the connection has been established or refused. 
603 */ 604 status_t 605 TCPEndpoint::Connect(const sockaddr* address) 606 { 607 if (!AddressModule()->is_same_family(address)) 608 return EAFNOSUPPORT; 609 610 MutexLocker locker(fLock); 611 612 TRACE("Connect() on address %s", PrintAddress(address)); 613 T(APICall(this, "connect")); 614 615 if (gStackModule->is_restarted_syscall()) { 616 bigtime_t timeout = gStackModule->restore_syscall_restart_timeout(); 617 status_t status = _WaitForEstablished(locker, timeout); 618 TRACE(" Connect(): Connection complete: %s (timeout was %" 619 B_PRIdBIGTIME ")", strerror(status), timeout); 620 return posix_error(status); 621 } 622 623 // Can only call connect() from CLOSED or LISTEN states 624 // otherwise endpoint is considered already connected 625 if (fState == LISTEN) { 626 // this socket is about to connect; remove pending connections in the backlog 627 gSocketModule->set_max_backlog(socket, 0); 628 } else if (fState == ESTABLISHED) { 629 return EISCONN; 630 } else if (fState != CLOSED) 631 return EALREADY; 632 633 // consider destination address INADDR_ANY as INADDR_LOOPBACK 634 sockaddr_storage _address; 635 if (AddressModule()->is_empty_address(address, false)) { 636 AddressModule()->get_loopback_address((sockaddr *)&_address); 637 // for IPv4 and IPv6 the port is at the same offset 638 ((sockaddr_in &)_address).sin_port = ((sockaddr_in *)address)->sin_port; 639 address = (sockaddr *)&_address; 640 } 641 642 status_t status = _PrepareSendPath(address); 643 if (status < B_OK) 644 return status; 645 646 TRACE(" Connect(): starting 3-way handshake..."); 647 648 fState = SYNCHRONIZE_SENT; 649 T(State(this)); 650 651 // send SYN 652 status = _SendAcknowledge(); 653 if (status != B_OK) { 654 _Close(); 655 return status; 656 } 657 658 // If we are running over Loopback, after _SendQueued() returns we 659 // may be in ESTABLISHED already. 
660 if (fState == ESTABLISHED) { 661 TRACE(" Connect() completed after _SendQueued()"); 662 return B_OK; 663 } 664 665 // wait until 3-way handshake is complete (if needed) 666 bigtime_t timeout = min_c(socket->send.timeout, TCP_CONNECTION_TIMEOUT); 667 if (timeout == 0) { 668 // we're a non-blocking socket 669 TRACE(" Connect() delayed, return EINPROGRESS"); 670 return EINPROGRESS; 671 } 672 673 bigtime_t absoluteTimeout = absolute_timeout(timeout); 674 gStackModule->store_syscall_restart_timeout(absoluteTimeout); 675 676 status = _WaitForEstablished(locker, absoluteTimeout); 677 TRACE(" Connect(): Connection complete: %s (timeout was %" B_PRIdBIGTIME 678 ")", strerror(status), timeout); 679 return posix_error(status); 680 } 681 682 683 status_t 684 TCPEndpoint::Accept(struct net_socket** _acceptedSocket) 685 { 686 MutexLocker locker(fLock); 687 688 TRACE("Accept()"); 689 T(APICall(this, "accept")); 690 691 status_t status; 692 bigtime_t timeout = absolute_timeout(socket->receive.timeout); 693 if (gStackModule->is_restarted_syscall()) 694 timeout = gStackModule->restore_syscall_restart_timeout(); 695 else 696 gStackModule->store_syscall_restart_timeout(timeout); 697 698 do { 699 locker.Unlock(); 700 701 status = acquire_sem_etc(fAcceptSemaphore, 1, B_ABSOLUTE_TIMEOUT 702 | B_CAN_INTERRUPT, timeout); 703 if (status != B_OK) { 704 if (status == B_TIMED_OUT && socket->receive.timeout == 0) 705 return B_WOULD_BLOCK; 706 707 return status; 708 } 709 710 locker.Lock(); 711 status = gSocketModule->dequeue_connected(socket, _acceptedSocket); 712 #ifdef TRACE_TCP 713 if (status == B_OK) 714 TRACE(" Accept() returning %p", (*_acceptedSocket)->first_protocol); 715 #endif 716 } while (status != B_OK); 717 718 return status; 719 } 720 721 722 status_t 723 TCPEndpoint::Bind(const sockaddr *address) 724 { 725 if (address == NULL) 726 return B_BAD_VALUE; 727 728 MutexLocker lock(fLock); 729 730 TRACE("Bind() on address %s", PrintAddress(address)); 731 T(APICall(this, "bind")); 
732 733 if (fState != CLOSED) 734 return EISCONN; 735 736 return fManager->Bind(this, address); 737 } 738 739 740 status_t 741 TCPEndpoint::Unbind(struct sockaddr *address) 742 { 743 MutexLocker _(fLock); 744 745 TRACE("Unbind()"); 746 T(APICall(this, "unbind")); 747 748 return fManager->Unbind(this); 749 } 750 751 752 status_t 753 TCPEndpoint::Listen(int count) 754 { 755 MutexLocker _(fLock); 756 757 TRACE("Listen()"); 758 T(APICall(this, "listen")); 759 760 if (fState != CLOSED && fState != LISTEN) 761 return B_BAD_VALUE; 762 763 if (fState == CLOSED) { 764 fAcceptSemaphore = create_sem(0, "tcp accept"); 765 if (fAcceptSemaphore < B_OK) 766 return ENOBUFS; 767 768 status_t status = fManager->SetPassive(this); 769 if (status != B_OK) { 770 delete_sem(fAcceptSemaphore); 771 fAcceptSemaphore = -1; 772 return status; 773 } 774 } 775 776 gSocketModule->set_max_backlog(socket, count); 777 778 fState = LISTEN; 779 T(State(this)); 780 return B_OK; 781 } 782 783 784 status_t 785 TCPEndpoint::Shutdown(int direction) 786 { 787 MutexLocker lock(fLock); 788 789 TRACE("Shutdown(%i)", direction); 790 T(APICall(this, "shutdown")); 791 792 if (direction == SHUT_RD || direction == SHUT_RDWR) 793 fFlags |= FLAG_NO_RECEIVE; 794 795 if (direction == SHUT_WR || direction == SHUT_RDWR) { 796 // TODO: That's not correct. After read/write shutting down the socket 797 // one should still be able to read previously arrived data. 798 _Disconnect(false); 799 } 800 801 return B_OK; 802 } 803 804 805 /*! 
Puts data contained in \a buffer into send buffer */ 806 status_t 807 TCPEndpoint::SendData(net_buffer *buffer) 808 { 809 MutexLocker lock(fLock); 810 811 TRACE("SendData(buffer %p, size %" B_PRIu32 ", flags %#" B_PRIx32 812 ") [total %" B_PRIuSIZE " bytes, has %" B_PRIuSIZE "]", buffer, 813 buffer->size, buffer->flags, fSendQueue.Size(), fSendQueue.Free()); 814 T(APICall(this, "senddata")); 815 816 const uint32 flags = buffer->flags; 817 if ((flags & ~(MSG_DONTWAIT | MSG_OOB | MSG_EOF)) != 0) 818 return EOPNOTSUPP; 819 820 if (fState == CLOSED) 821 return ENOTCONN; 822 if (fState == LISTEN) 823 return EDESTADDRREQ; 824 if (!is_writable(fState) && !is_establishing(fState)) 825 return EPIPE; 826 827 size_t left = buffer->size; 828 829 bigtime_t timeout = 0; 830 if ((flags & MSG_DONTWAIT) == 0) { 831 timeout = absolute_timeout(socket->send.timeout); 832 if (gStackModule->is_restarted_syscall()) 833 timeout = gStackModule->restore_syscall_restart_timeout(); 834 else 835 gStackModule->store_syscall_restart_timeout(timeout); 836 } 837 838 while (left > 0) { 839 while (fSendQueue.Free() < socket->send.low_water_mark) { 840 // initiate a send before waiting 841 _SendQueued(); 842 843 // wait until enough space is available 844 status_t status = _WaitForCondition(fSendCondition, lock, timeout); 845 if (status < B_OK) { 846 TRACE(" SendData() returning %s (%d)", 847 strerror(posix_error(status)), (int)posix_error(status)); 848 return posix_error(status); 849 } 850 851 if (!is_writable(fState) && !is_establishing(fState)) 852 return EPIPE; 853 } 854 855 size_t size = fSendQueue.Free(); 856 if (size < left) { 857 // we need to split the original buffer 858 net_buffer* clone = gBufferModule->clone(buffer, false); 859 // TODO: add offset/size parameter to net_buffer::clone() or 860 // even a move_data() function, as this is a bit inefficient 861 if (clone == NULL) 862 return ENOBUFS; 863 864 status_t status = gBufferModule->trim(clone, size); 865 if (status != B_OK) { 866 
gBufferModule->free(clone); 867 return status; 868 } 869 870 gBufferModule->remove_header(buffer, size); 871 left -= size; 872 fSendQueue.Add(clone); 873 } else { 874 left -= buffer->size; 875 fSendQueue.Add(buffer); 876 } 877 } 878 879 TRACE(" SendData(): %" B_PRIuSIZE " bytes used.", fSendQueue.Used()); 880 881 bool force = false; 882 if ((flags & MSG_OOB) != 0) { 883 fSendUrgentOffset = fSendQueue.LastSequence(); 884 // RFC 961 specifies that the urgent offset points to the last 885 // byte of urgent data. However, this is commonly implemented as 886 // here, ie. it points to the first byte after the urgent data. 887 force = true; 888 } 889 if ((flags & MSG_EOF) != 0) 890 _Disconnect(false); 891 892 _SendQueued(force); 893 894 return B_OK; 895 } 896 897 898 ssize_t 899 TCPEndpoint::SendAvailable() 900 { 901 MutexLocker locker(fLock); 902 903 ssize_t available; 904 905 if (is_writable(fState)) 906 available = fSendQueue.Free(); 907 else if (is_establishing(fState)) 908 available = 0; 909 else 910 available = EPIPE; 911 912 TRACE("SendAvailable(): %" B_PRIdSSIZE, available); 913 T(APICall(this, "sendavailable")); 914 return available; 915 } 916 917 918 status_t 919 TCPEndpoint::FillStat(net_stat *stat) 920 { 921 MutexLocker _(fLock); 922 923 strlcpy(stat->state, name_for_state(fState), sizeof(stat->state)); 924 stat->receive_queue_size = fReceiveQueue.Available(); 925 stat->send_queue_size = fSendQueue.Used(); 926 927 return B_OK; 928 } 929 930 931 status_t 932 TCPEndpoint::ReadData(size_t numBytes, uint32 flags, net_buffer** _buffer) 933 { 934 if ((flags & ~(MSG_DONTWAIT | MSG_WAITALL | MSG_PEEK)) != 0) 935 return EOPNOTSUPP; 936 937 MutexLocker locker(fLock); 938 939 TRACE("ReadData(%" B_PRIuSIZE " bytes, flags %#" B_PRIx32 ")", numBytes, 940 flags); 941 T(APICall(this, "readdata")); 942 943 *_buffer = NULL; 944 945 if (fState == CLOSED || fState == LISTEN) { 946 if (socket->error != B_OK) 947 return socket->error; 948 return ENOTCONN; 949 } 950 951 bigtime_t 
timeout = 0; 952 if ((flags & MSG_DONTWAIT) == 0) { 953 timeout = absolute_timeout(socket->receive.timeout); 954 if (gStackModule->is_restarted_syscall()) 955 timeout = gStackModule->restore_syscall_restart_timeout(); 956 else 957 gStackModule->store_syscall_restart_timeout(timeout); 958 } 959 960 if (fState == SYNCHRONIZE_SENT || fState == SYNCHRONIZE_RECEIVED) { 961 if (flags & MSG_DONTWAIT) 962 return B_WOULD_BLOCK; 963 964 status_t status = _WaitForEstablished(locker, timeout); 965 if (status < B_OK) 966 return posix_error(status); 967 } 968 969 size_t dataNeeded = socket->receive.low_water_mark; 970 971 // When MSG_WAITALL is set then the function should block 972 // until the full amount of data can be returned. 973 if (flags & MSG_WAITALL) 974 dataNeeded = numBytes; 975 976 // TODO: add support for urgent data (MSG_OOB) 977 978 while (true) { 979 if (fState == CLOSING || fState == WAIT_FOR_FINISH_ACKNOWLEDGE 980 || fState == TIME_WAIT) { 981 // ``Connection closing''. 982 if (fReceiveQueue.Available() > 0) 983 break; 984 return B_OK; 985 } 986 987 if (fReceiveQueue.Available() > 0) { 988 if (fReceiveQueue.Available() >= dataNeeded 989 || (fReceiveQueue.PushedData() > 0 990 && fReceiveQueue.PushedData() >= fReceiveQueue.Available())) 991 break; 992 } else if (fState == FINISH_RECEIVED) { 993 // ``If no text is awaiting delivery, the RECEIVE will 994 // get a Connection closing''. 995 return B_OK; 996 } 997 998 if (timeout == 0) 999 return B_WOULD_BLOCK; 1000 1001 if ((fFlags & FLAG_NO_RECEIVE) != 0) 1002 return B_OK; 1003 1004 status_t status = _WaitForCondition(fReceiveCondition, locker, timeout); 1005 if (status < B_OK) { 1006 // The Open Group base specification mentions that EINTR should be 1007 // returned if the recv() is interrupted before _any data_ is 1008 // available. So we actually check if there is data, and if so, 1009 // push it to the user. 
1010 if ((status == B_TIMED_OUT || status == B_INTERRUPTED) 1011 && fReceiveQueue.Available() > 0) 1012 break; 1013 1014 return posix_error(status); 1015 } 1016 } 1017 1018 TRACE(" ReadData(): %" B_PRIuSIZE " are available.", 1019 fReceiveQueue.Available()); 1020 1021 if (numBytes < fReceiveQueue.Available()) 1022 fReceiveCondition.NotifyAll(); 1023 1024 bool clone = (flags & MSG_PEEK) != 0; 1025 1026 ssize_t receivedBytes = fReceiveQueue.Get(numBytes, !clone, _buffer); 1027 1028 TRACE(" ReadData(): %" B_PRIuSIZE " bytes kept.", 1029 fReceiveQueue.Available()); 1030 1031 if (fReceiveQueue.Available() == 0 && fState == FINISH_RECEIVED) 1032 socket->receive.low_water_mark = 0; 1033 1034 // if we are opening the window, check if we should send an ACK 1035 if (!clone) 1036 _SendAcknowledge(); 1037 1038 return receivedBytes; 1039 } 1040 1041 1042 ssize_t 1043 TCPEndpoint::ReadAvailable() 1044 { 1045 MutexLocker locker(fLock); 1046 1047 TRACE("ReadAvailable(): %" B_PRIdSSIZE, _AvailableData()); 1048 T(APICall(this, "readavailable")); 1049 1050 return _AvailableData(); 1051 } 1052 1053 1054 status_t 1055 TCPEndpoint::SetSendBufferSize(size_t length) 1056 { 1057 MutexLocker _(fLock); 1058 fSendQueue.SetMaxBytes(length); 1059 return B_OK; 1060 } 1061 1062 1063 status_t 1064 TCPEndpoint::SetReceiveBufferSize(size_t length) 1065 { 1066 MutexLocker _(fLock); 1067 fReceiveQueue.SetMaxBytes(length); 1068 return B_OK; 1069 } 1070 1071 1072 status_t 1073 TCPEndpoint::GetOption(int option, void* _value, int* _length) 1074 { 1075 if (*_length != sizeof(int)) 1076 return B_BAD_VALUE; 1077 1078 int* value = (int*)_value; 1079 1080 switch (option) { 1081 case TCP_NODELAY: 1082 if ((fOptions & TCP_NODELAY) != 0) 1083 *value = 1; 1084 else 1085 *value = 0; 1086 return B_OK; 1087 1088 case TCP_MAXSEG: 1089 *value = fReceiveMaxSegmentSize; 1090 return B_OK; 1091 1092 default: 1093 return B_BAD_VALUE; 1094 } 1095 } 1096 1097 1098 status_t 1099 TCPEndpoint::SetOption(int option, const void* 
_value, int length) 1100 { 1101 if (option != TCP_NODELAY) 1102 return B_BAD_VALUE; 1103 1104 if (length != sizeof(int)) 1105 return B_BAD_VALUE; 1106 1107 const int* value = (const int*)_value; 1108 1109 MutexLocker _(fLock); 1110 if (*value) 1111 fOptions |= TCP_NODELAY; 1112 else 1113 fOptions &= ~TCP_NODELAY; 1114 1115 return B_OK; 1116 } 1117 1118 1119 // #pragma mark - misc 1120 1121 1122 bool 1123 TCPEndpoint::IsBound() const 1124 { 1125 return !LocalAddress().IsEmpty(true); 1126 } 1127 1128 1129 bool 1130 TCPEndpoint::IsLocal() const 1131 { 1132 return (fFlags & FLAG_LOCAL) != 0; 1133 } 1134 1135 1136 status_t 1137 TCPEndpoint::DelayedAcknowledge() 1138 { 1139 // ACKs "MUST" be generated within 500ms of the first unACKed packet, and 1140 // "SHOULD" be for at least every second full-size segment. (RFC 5681 § 4.2) 1141 1142 bigtime_t delay = TCP_DELAYED_ACKNOWLEDGE_TIMEOUT; 1143 if ((fReceiveNext - fLastAcknowledgeSent) >= (fReceiveMaxSegmentSize * 2)) { 1144 // Trigger an immediate timeout rather than invoking Send directly, 1145 // allowing multiple invocations to be coalesced. 
1146 delay = 0; 1147 } else if (gStackModule->is_timer_active(&fDelayedAcknowledgeTimer)) { 1148 return B_OK; 1149 } 1150 1151 gStackModule->set_timer(&fDelayedAcknowledgeTimer, delay); 1152 T(TimerSet(this, "delayed ack", TCP_DELAYED_ACKNOWLEDGE_TIMEOUT)); 1153 return B_OK; 1154 } 1155 1156 1157 void 1158 TCPEndpoint::_StartPersistTimer() 1159 { 1160 gStackModule->set_timer(&fPersistTimer, TCP_PERSIST_TIMEOUT); 1161 T(TimerSet(this, "persist", TCP_PERSIST_TIMEOUT)); 1162 } 1163 1164 1165 void 1166 TCPEndpoint::_EnterTimeWait() 1167 { 1168 TRACE("_EnterTimeWait()"); 1169 1170 if (fState == TIME_WAIT) { 1171 _CancelConnectionTimers(); 1172 } 1173 1174 _UpdateTimeWait(); 1175 } 1176 1177 1178 void 1179 TCPEndpoint::_UpdateTimeWait() 1180 { 1181 gStackModule->set_timer(&fTimeWaitTimer, TCP_MAX_SEGMENT_LIFETIME << 1); 1182 T(TimerSet(this, "time-wait", TCP_MAX_SEGMENT_LIFETIME << 1)); 1183 } 1184 1185 1186 void 1187 TCPEndpoint::_CancelConnectionTimers() 1188 { 1189 gStackModule->cancel_timer(&fRetransmitTimer); 1190 T(TimerSet(this, "retransmit", -1)); 1191 gStackModule->cancel_timer(&fPersistTimer); 1192 T(TimerSet(this, "persist", -1)); 1193 gStackModule->cancel_timer(&fDelayedAcknowledgeTimer); 1194 T(TimerSet(this, "delayed ack", -1)); 1195 } 1196 1197 1198 /*! Sends the FIN flag to the peer when the connection is still open. 1199 Moves the endpoint to the next state depending on where it was. 
1200 */ 1201 status_t 1202 TCPEndpoint::_Disconnect(bool closing) 1203 { 1204 tcp_state previousState = fState; 1205 1206 if (fState == SYNCHRONIZE_RECEIVED || fState == ESTABLISHED) 1207 fState = FINISH_SENT; 1208 else if (fState == FINISH_RECEIVED) 1209 fState = WAIT_FOR_FINISH_ACKNOWLEDGE; 1210 else 1211 return B_OK; 1212 1213 T(State(this)); 1214 1215 status_t status = _SendQueued(); 1216 if (status != B_OK) { 1217 fState = previousState; 1218 T(State(this)); 1219 return status; 1220 } 1221 1222 return B_OK; 1223 } 1224 1225 1226 void 1227 TCPEndpoint::_MarkEstablished() 1228 { 1229 fState = ESTABLISHED; 1230 T(State(this)); 1231 1232 gSocketModule->set_connected(socket); 1233 if (gSocketModule->has_parent(socket)) 1234 release_sem_etc(fAcceptSemaphore, 1, B_DO_NOT_RESCHEDULE); 1235 1236 fSendCondition.NotifyAll(); 1237 gSocketModule->notify(socket, B_SELECT_WRITE, fSendQueue.Free()); 1238 } 1239 1240 1241 status_t 1242 TCPEndpoint::_WaitForEstablished(MutexLocker &locker, bigtime_t timeout) 1243 { 1244 // TODO: Checking for CLOSED seems correct, but breaks several neon tests. 1245 // When investigating this, also have a look at _Close() and _HandleReset(). 1246 while (fState < ESTABLISHED/* && fState != CLOSED*/) { 1247 if (socket->error != B_OK) 1248 return socket->error; 1249 1250 status_t status = _WaitForCondition(fSendCondition, locker, timeout); 1251 if (status < B_OK) 1252 return status; 1253 } 1254 1255 return B_OK; 1256 } 1257 1258 1259 // #pragma mark - receive 1260 1261 1262 void 1263 TCPEndpoint::_Close() 1264 { 1265 _CancelConnectionTimers(); 1266 fState = CLOSED; 1267 T(State(this)); 1268 1269 fFlags |= FLAG_DELETE_ON_CLOSE; 1270 1271 fSendCondition.NotifyAll(); 1272 _NotifyReader(); 1273 1274 if (gSocketModule->has_parent(socket)) { 1275 // We still have a parent - obviously, we haven't been accepted yet, 1276 // so no one could ever close us. 
1277 _CancelConnectionTimers(); 1278 gSocketModule->set_aborted(socket); 1279 } 1280 } 1281 1282 1283 void 1284 TCPEndpoint::_HandleReset(status_t error) 1285 { 1286 socket->error = error; 1287 _Close(); 1288 1289 gSocketModule->notify(socket, B_SELECT_WRITE, error); 1290 gSocketModule->notify(socket, B_SELECT_ERROR, error); 1291 } 1292 1293 1294 void 1295 TCPEndpoint::_DuplicateAcknowledge(tcp_segment_header &segment) 1296 { 1297 if (fDuplicateAcknowledgeCount == 0) 1298 fPreviousFlightSize = (fSendMax - fSendUnacknowledged).Number(); 1299 1300 if (++fDuplicateAcknowledgeCount < 3) { 1301 if (fSendQueue.Available(fSendMax) != 0 && fSendWindow != 0) { 1302 fSendNext = fSendMax; 1303 fCongestionWindow += fDuplicateAcknowledgeCount * fSendMaxSegmentSize; 1304 _SendQueued(); 1305 TRACE("_DuplicateAcknowledge(): packet sent under limited transmit on receipt of dup ack"); 1306 fCongestionWindow -= fDuplicateAcknowledgeCount * fSendMaxSegmentSize; 1307 } 1308 } 1309 1310 if (fDuplicateAcknowledgeCount == 3) { 1311 if ((segment.acknowledge - 1) > fRecover || (fCongestionWindow > fSendMaxSegmentSize && 1312 (fSendUnacknowledged - fPreviousHighestAcknowledge) <= 4 * fSendMaxSegmentSize)) { 1313 fFlags |= FLAG_RECOVERY; 1314 fRecover = fSendMax.Number() - 1; 1315 fSlowStartThreshold = max_c(fPreviousFlightSize / 2, 2 * fSendMaxSegmentSize); 1316 fCongestionWindow = fSlowStartThreshold + 3 * fSendMaxSegmentSize; 1317 fSendNext = segment.acknowledge; 1318 _SendQueued(); 1319 TRACE("_DuplicateAcknowledge(): packet sent under fast restransmit on the receipt of 3rd dup ack"); 1320 } 1321 } else if (fDuplicateAcknowledgeCount > 3) { 1322 uint32 flightSize = (fSendMax - fSendUnacknowledged).Number(); 1323 if ((fDuplicateAcknowledgeCount - 3) * fSendMaxSegmentSize <= flightSize) 1324 fCongestionWindow += fSendMaxSegmentSize; 1325 if (fSendQueue.Available(fSendMax) != 0) { 1326 fSendNext = fSendMax; 1327 _SendQueued(); 1328 } 1329 } 1330 } 1331 1332 1333 void 1334 
TCPEndpoint::_UpdateTimestamps(tcp_segment_header& segment, 1335 size_t segmentLength) 1336 { 1337 if (fFlags & FLAG_OPTION_TIMESTAMP) { 1338 tcp_sequence sequence(segment.sequence); 1339 1340 if (fLastAcknowledgeSent >= sequence 1341 && fLastAcknowledgeSent < (sequence + segmentLength)) 1342 fReceivedTimestamp = segment.timestamp_value; 1343 } 1344 } 1345 1346 1347 ssize_t 1348 TCPEndpoint::_AvailableData() const 1349 { 1350 // TODO: Refer to the FLAG_NO_RECEIVE comment above regarding 1351 // the application of FLAG_NO_RECEIVE in listen()ing 1352 // sockets. 1353 if (fState == LISTEN) 1354 return gSocketModule->count_connected(socket); 1355 if (fState == SYNCHRONIZE_SENT) 1356 return 0; 1357 1358 ssize_t availableData = fReceiveQueue.Available(); 1359 1360 if (availableData == 0 && !_ShouldReceive()) 1361 return ENOTCONN; 1362 if (availableData == 0 && (fState == FINISH_RECEIVED || fState == WAIT_FOR_FINISH_ACKNOWLEDGE)) 1363 return ESHUTDOWN; 1364 return availableData; 1365 } 1366 1367 1368 void 1369 TCPEndpoint::_NotifyReader() 1370 { 1371 fReceiveCondition.NotifyAll(); 1372 gSocketModule->notify(socket, B_SELECT_READ, _AvailableData()); 1373 } 1374 1375 1376 bool 1377 TCPEndpoint::_AddData(tcp_segment_header& segment, net_buffer* buffer) 1378 { 1379 if ((segment.flags & TCP_FLAG_FINISH) != 0) { 1380 // Remember the position of the finish received flag 1381 fFinishReceived = true; 1382 fFinishReceivedAt = segment.sequence + buffer->size; 1383 } 1384 1385 fReceiveQueue.Add(buffer, segment.sequence); 1386 fReceiveNext = fReceiveQueue.NextSequence(); 1387 1388 if (fFinishReceived) { 1389 // Set or reset the finish flag on the current segment 1390 if (fReceiveNext < fFinishReceivedAt) 1391 segment.flags &= ~TCP_FLAG_FINISH; 1392 else 1393 segment.flags |= TCP_FLAG_FINISH; 1394 } 1395 1396 TRACE(" _AddData(): adding data, receive next = %" B_PRIu32 ". 
Now have %" 1397 B_PRIuSIZE " bytes.", fReceiveNext.Number(), fReceiveQueue.Available()); 1398 1399 if ((segment.flags & TCP_FLAG_PUSH) != 0) 1400 fReceiveQueue.SetPushPointer(); 1401 1402 return fReceiveQueue.Available() > 0; 1403 } 1404 1405 1406 void 1407 TCPEndpoint::_PrepareReceivePath(tcp_segment_header& segment) 1408 { 1409 fInitialReceiveSequence = segment.sequence; 1410 fFinishReceived = false; 1411 1412 // count the received SYN 1413 segment.sequence++; 1414 1415 fReceiveNext = segment.sequence; 1416 fReceiveQueue.SetInitialSequence(segment.sequence); 1417 1418 if ((fOptions & TCP_NOOPT) == 0) { 1419 if (segment.max_segment_size > 0) { 1420 // The maximum size of a segment that a TCP endpoint really sends, 1421 // the "effective send MSS", MUST be the smaller of the send MSS and 1422 // the largest transmission size permitted by the IP layer: 1423 fSendMaxSegmentSize = min_c(segment.max_segment_size, 1424 _MaxSegmentSize(*PeerAddress())); 1425 } 1426 1427 if (segment.options & TCP_HAS_WINDOW_SCALE) { 1428 fFlags |= FLAG_OPTION_WINDOW_SCALE; 1429 fSendWindowShift = segment.window_shift; 1430 } else { 1431 fFlags &= ~FLAG_OPTION_WINDOW_SCALE; 1432 fReceiveWindowShift = 0; 1433 } 1434 1435 if (segment.options & TCP_HAS_TIMESTAMPS) { 1436 fFlags |= FLAG_OPTION_TIMESTAMP; 1437 fReceivedTimestamp = segment.timestamp_value; 1438 } else 1439 fFlags &= ~FLAG_OPTION_TIMESTAMP; 1440 1441 if ((segment.options & TCP_SACK_PERMITTED) == 0) 1442 fFlags &= ~FLAG_OPTION_SACK_PERMITTED; 1443 } 1444 1445 if (fSendMaxSegmentSize > 2190) 1446 fCongestionWindow = 2 * fSendMaxSegmentSize; 1447 else if (fSendMaxSegmentSize > 1095) 1448 fCongestionWindow = 3 * fSendMaxSegmentSize; 1449 else 1450 fCongestionWindow = 4 * fSendMaxSegmentSize; 1451 1452 fSendMaxSegments = fCongestionWindow / fSendMaxSegmentSize; 1453 fSlowStartThreshold = (uint32)segment.advertised_window << fSendWindowShift; 1454 } 1455 1456 1457 bool 1458 TCPEndpoint::_ShouldReceive() const 1459 { 1460 if ((fFlags & 
FLAG_NO_RECEIVE) != 0) 1461 return false; 1462 1463 return fState == ESTABLISHED || fState == FINISH_SENT 1464 || fState == FINISH_ACKNOWLEDGED || fState == FINISH_RECEIVED; 1465 } 1466 1467 1468 int32 1469 TCPEndpoint::_Spawn(TCPEndpoint* parent, tcp_segment_header& segment, 1470 net_buffer* buffer) 1471 { 1472 MutexLocker _(fLock); 1473 1474 TRACE("Spawn()"); 1475 1476 // TODO: proper error handling! 1477 if (ProtocolSocket::Open() != B_OK) { 1478 T(Error(this, "opening failed", __LINE__)); 1479 return DROP; 1480 } 1481 1482 fState = SYNCHRONIZE_RECEIVED; 1483 T(Spawn(parent, this)); 1484 1485 fManager = parent->fManager; 1486 1487 if (fManager->BindChild(this, buffer->destination) != B_OK) { 1488 T(Error(this, "binding failed", __LINE__)); 1489 return DROP; 1490 } 1491 if (_PrepareSendPath(buffer->source) != B_OK) { 1492 T(Error(this, "prepare send faild", __LINE__)); 1493 return DROP; 1494 } 1495 1496 fOptions = parent->fOptions; 1497 fAcceptSemaphore = parent->fAcceptSemaphore; 1498 1499 _PrepareReceivePath(segment); 1500 1501 // send SYN+ACK 1502 if (_SendAcknowledge() != B_OK) { 1503 T(Error(this, "sending failed", __LINE__)); 1504 return DROP; 1505 } 1506 1507 segment.flags &= ~TCP_FLAG_SYNCHRONIZE; 1508 // we handled this flag now, it must not be set for further processing 1509 1510 return _Receive(segment, buffer); 1511 } 1512 1513 1514 int32 1515 TCPEndpoint::_ListenReceive(tcp_segment_header& segment, net_buffer* buffer) 1516 { 1517 TRACE("ListenReceive()"); 1518 1519 // Essentially, we accept only TCP_FLAG_SYNCHRONIZE in this state, 1520 // but the error behaviour differs 1521 if (segment.flags & TCP_FLAG_RESET) 1522 return DROP; 1523 if (segment.flags & TCP_FLAG_ACKNOWLEDGE) 1524 return DROP | RESET; 1525 if ((segment.flags & TCP_FLAG_SYNCHRONIZE) == 0) 1526 return DROP; 1527 1528 // TODO: drop broadcast/multicast 1529 1530 // spawn new endpoint for accept() 1531 net_socket* newSocket; 1532 if (gSocketModule->spawn_pending_socket(socket, &newSocket) < 
B_OK) { 1533 T(Error(this, "spawning failed", __LINE__)); 1534 return DROP; 1535 } 1536 1537 return ((TCPEndpoint *)newSocket->first_protocol)->_Spawn(this, 1538 segment, buffer); 1539 } 1540 1541 1542 int32 1543 TCPEndpoint::_SynchronizeSentReceive(tcp_segment_header &segment, 1544 net_buffer *buffer) 1545 { 1546 TRACE("_SynchronizeSentReceive()"); 1547 1548 if ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0 1549 && (fInitialSendSequence >= segment.acknowledge 1550 || fSendMax < segment.acknowledge)) 1551 return DROP | RESET; 1552 1553 if (segment.flags & TCP_FLAG_RESET) { 1554 _HandleReset(ECONNREFUSED); 1555 return DROP; 1556 } 1557 1558 if ((segment.flags & TCP_FLAG_SYNCHRONIZE) == 0) 1559 return DROP; 1560 1561 fSendUnacknowledged = segment.acknowledge; 1562 _PrepareReceivePath(segment); 1563 1564 if (segment.flags & TCP_FLAG_ACKNOWLEDGE) { 1565 _MarkEstablished(); 1566 } else { 1567 // simultaneous open 1568 fState = SYNCHRONIZE_RECEIVED; 1569 T(State(this)); 1570 } 1571 1572 segment.flags &= ~TCP_FLAG_SYNCHRONIZE; 1573 // we handled this flag now, it must not be set for further processing 1574 1575 return _Receive(segment, buffer) | IMMEDIATE_ACKNOWLEDGE; 1576 } 1577 1578 1579 int32 1580 TCPEndpoint::_Receive(tcp_segment_header& segment, net_buffer* buffer) 1581 { 1582 // PAWS processing takes precedence over regular TCP acceptability check 1583 if ((fFlags & FLAG_OPTION_TIMESTAMP) != 0 && (segment.flags & TCP_FLAG_RESET) == 0) { 1584 if ((segment.options & TCP_HAS_TIMESTAMPS) == 0) 1585 return DROP; 1586 if ((int32)(fReceivedTimestamp - segment.timestamp_value) > 0 1587 && (fReceivedTimestamp - segment.timestamp_value) <= INT32_MAX) 1588 return DROP | IMMEDIATE_ACKNOWLEDGE; 1589 } 1590 1591 uint32 advertisedWindow = segment.AdvertisedWindow(fSendWindowShift); 1592 size_t segmentLength = buffer->size; 1593 1594 // First, handle the most common case for uni-directional data transfer 1595 // (known as header prediction - the segment must not change the window, 
	// and must be the expected sequence, and contain no control flags)

	if (fState == ESTABLISHED
		&& segment.AcknowledgeOnly()
		&& fReceiveNext == segment.sequence
		&& advertisedWindow > 0 && advertisedWindow == fSendWindow
		&& fSendNext == fSendMax) {
		_UpdateTimestamps(segment, segmentLength);

		if (segmentLength == 0) {
			// this is a pure acknowledge segment - we're on the sending end
			if (fSendUnacknowledged < segment.acknowledge
				&& fSendMax >= segment.acknowledge) {
				_Acknowledged(segment);
				return DROP;
			}
		} else if (segment.acknowledge == fSendUnacknowledged
			&& fReceiveQueue.IsContiguous()
			&& fReceiveQueue.Free() >= segmentLength
			&& (fFlags & FLAG_NO_RECEIVE) == 0) {
			// this is a pure in-order data segment that fits into the
			// receive queue - we're on the receiving end
			if (_AddData(segment, buffer))
				_NotifyReader();

			// a pushed segment is acknowledged immediately, anything else
			// may use the delayed acknowledge
			return KEEP | ((segment.flags & TCP_FLAG_PUSH) != 0
				? IMMEDIATE_ACKNOWLEDGE : ACKNOWLEDGE);
		}
	}

	// The fast path was not applicable, so we continue with the standard
	// processing of the incoming segment

	ASSERT(fState != SYNCHRONIZE_SENT && fState != LISTEN);

	if (fState != CLOSED && fState != TIME_WAIT) {
		// Check sequence number
		if (!segment_in_sequence(segment, segmentLength, fReceiveNext,
				fReceiveWindow)) {
			TRACE(" Receive(): segment out of window, next: %" B_PRIu32
				" wnd: %" B_PRIu32, fReceiveNext.Number(), fReceiveWindow);
			if ((segment.flags & TCP_FLAG_RESET) != 0) {
				// TODO: this doesn't look right - review!
				return DROP;
			}
			return DROP | IMMEDIATE_ACKNOWLEDGE;
		}
	}

	if ((segment.flags & TCP_FLAG_RESET) != 0) {
		// Is this a valid reset?
		// We generally ignore resets in time wait state (see RFC 1337)
		if (fLastAcknowledgeSent <= segment.sequence
			&& tcp_sequence(segment.sequence) < (fLastAcknowledgeSent
				+ fReceiveWindow)
			&& fState != TIME_WAIT) {
			// The error reported to the socket depends on how far the
			// connection had come when the reset arrived.
			status_t error;
			if (fState == SYNCHRONIZE_RECEIVED)
				error = ECONNREFUSED;
			else if (fState == CLOSING || fState == WAIT_FOR_FINISH_ACKNOWLEDGE)
				error = ENOTCONN;
			else
				error = ECONNRESET;

			_HandleReset(error);
		}

		// a reset segment is never processed any further
		return DROP;
	}

	if ((segment.flags & TCP_FLAG_SYNCHRONIZE) != 0
		|| (fState == SYNCHRONIZE_RECEIVED
			&& (fInitialReceiveSequence > segment.sequence
				|| ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0
					&& (fSendUnacknowledged > segment.acknowledge
						|| fSendMax < segment.acknowledge))))) {
		// reset the connection - either the initial SYN was faulty, or we
		// received a SYN within the data stream
		return DROP | RESET;
	}

	// TODO: Check this! Why do we advertise a window outside of what we should
	// buffer?
	fReceiveWindow = max_c(fReceiveQueue.Free(), fReceiveWindow);
		// the window must not shrink

	// trim buffer to be within the receive window
	// ("drop" is the number of bytes at the start of the segment that lie
	// before fReceiveNext, i.e. that we have received already)
	int32 drop = (int32)(fReceiveNext - segment.sequence).Number();
	if (drop > 0) {
		if ((uint32)drop > buffer->size
			|| ((uint32)drop == buffer->size
				&& (segment.flags & TCP_FLAG_FINISH) == 0)) {
			// don't accidentally remove a FIN we shouldn't remove
			segment.flags &= ~TCP_FLAG_FINISH;
			drop = buffer->size;
		}

		// remove duplicate data at the start
		TRACE("* remove %" B_PRId32 " bytes from the start", drop);
		gBufferModule->remove_header(buffer, drop);
		segment.sequence += drop;
	}

	int32 action = KEEP;

	// immediately acknowledge out-of-order segment to trigger fast-retransmit at the sender
	if (drop != 0)
		action |= IMMEDIATE_ACKNOWLEDGE;

	// "drop" now becomes the number of bytes at the end of the segment that
	// reach beyond the receive window
	drop = (int32)(segment.sequence + buffer->size
		- (fReceiveNext + fReceiveWindow)).Number();
	if (drop > 0) {
		// remove data exceeding our window
		if ((uint32)drop >= buffer->size) {
			// if we can accept data, or the segment is not what we'd expect,
			// drop the segment (an immediate acknowledge is always triggered)
			if (fReceiveWindow != 0 || segment.sequence != fReceiveNext)
				return DROP | IMMEDIATE_ACKNOWLEDGE;

			action |= IMMEDIATE_ACKNOWLEDGE;
		}

		if ((segment.flags & TCP_FLAG_FINISH) != 0) {
			// we need to remove the finish, too, as part of the data
			drop--;
		}

		// neither FIN nor PUSH apply anymore once the end has been cut off
		segment.flags &= ~(TCP_FLAG_FINISH | TCP_FLAG_PUSH);
		TRACE("* remove %" B_PRId32 " bytes from the end", drop);
		gBufferModule->remove_trailer(buffer, drop);
	}

#ifdef TRACE_TCP
	if (advertisedWindow > fSendWindow) {
		TRACE(" Receive(): Window update %" B_PRIu32 " -> %" B_PRIu32,
			fSendWindow, advertisedWindow);
	}
#endif

	if (fSendWindow < fSendMaxSegmentSize
		&& advertisedWindow >= fSendMaxSegmentSize) {
		// Our current send window is less than a segment wide, and the new one
		// is larger, so trigger a send in case there's anything to be sent.
		action |= SEND_QUEUED;
	}

	// take over the window advertised by the peer, and keep track of the
	// largest window it ever advertised
	fSendWindow = advertisedWindow;
	if (advertisedWindow > fSendMaxWindow)
		fSendMaxWindow = advertisedWindow;

	// Then look at the acknowledgement for any updates

	if ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0) {
		// process acknowledged data
		if (fState == SYNCHRONIZE_RECEIVED)
			_MarkEstablished();

		// the peer acknowledges something we have not sent yet
		if (fSendMax < segment.acknowledge)
			return DROP | IMMEDIATE_ACKNOWLEDGE;

		if (segment.acknowledge == fSendUnacknowledged) {
			// NOTE(review): fSendWindow has been set to advertisedWindow
			// just above, so "advertisedWindow == fSendWindow" is always
			// true at this point. If the intent is to only count segments
			// that leave the window unchanged as duplicate acknowledgements,
			// the previous window would have to be saved before it is
			// overwritten - confirm.
			if (buffer->size == 0 && advertisedWindow == fSendWindow
				&& (segment.flags & TCP_FLAG_FINISH) == 0 && fSendUnacknowledged != fSendMax) {
				TRACE("Receive(): duplicate ack!");
				_DuplicateAcknowledge(segment);
			}
		} else if (segment.acknowledge < fSendUnacknowledged) {
			// an old acknowledgement for data that has been acknowledged
			// already
			return DROP;
		} else {
			// this segment acknowledges in flight data

			if (fDuplicateAcknowledgeCount >= 3) {
				// deflate the window.
				if (segment.acknowledge > fRecover) {
					uint32 flightSize = (fSendMax - fSendUnacknowledged).Number();
					fCongestionWindow = min_c(fSlowStartThreshold,
						max_c(flightSize, fSendMaxSegmentSize) + fSendMaxSegmentSize);
					fFlags &= ~FLAG_RECOVERY;
				}
			}

			if (fSendMax == segment.acknowledge)
				TRACE("Receive(): all inflight data ack'd!");

			// An acknowledgement beyond the last sequence of the send queue
			// can only be for our FIN.
			if (segment.acknowledge > fSendQueue.LastSequence()
				&& fState > ESTABLISHED) {
				TRACE("Receive(): FIN has been acknowledged!");

				switch (fState) {
					case FINISH_SENT:
						fState = FINISH_ACKNOWLEDGED;
						T(State(this));
						break;
					case CLOSING:
						fState = TIME_WAIT;
						T(State(this));
						_EnterTimeWait();
						return DROP;
					case WAIT_FOR_FINISH_ACKNOWLEDGE:
						_Close();
						break;

					default:
						break;
				}
			}

			if (fState != CLOSED) {
				// fLastAcknowledgeSent is compared before and after the
				// call to find out if an acknowledgement went out
				tcp_sequence last = fLastAcknowledgeSent;
				_Acknowledged(segment);
				// we just sent an acknowledge, remove from action
				if (last < fLastAcknowledgeSent)
					action &= ~IMMEDIATE_ACKNOWLEDGE;
			}
		}
	}

	if (segment.flags & TCP_FLAG_URGENT) {
		if (fState == ESTABLISHED || fState == FINISH_SENT
			|| fState == FINISH_ACKNOWLEDGED) {
			// TODO: Handle urgent data:
			//  - RCV.UP <- max(RCV.UP, SEG.UP)
			//  - signal the user that urgent data is available (SIGURG)
		}
	}

	// set if the readers have to be notified before we return
	bool notify = false;

	// The buffer may be freed if its data is added to the queue, so cache
	// the size as we still need it later.
	const uint32 bufferSize = buffer->size;

	// Queue the segment if it carries data or a FIN, and the endpoint is
	// still receiving; otherwise the buffer is to be dropped.
	if ((bufferSize > 0 || (segment.flags & TCP_FLAG_FINISH) != 0)
		&& _ShouldReceive())
		notify = _AddData(segment, buffer);
	else {
		// with FLAG_NO_RECEIVE set, the data is skipped over, so that the
		// receive sequence still advances
		if ((fFlags & FLAG_NO_RECEIVE) != 0)
			fReceiveNext += buffer->size;

		action = (action & ~KEEP) | DROP;
	}

	if ((segment.flags & TCP_FLAG_FINISH) != 0) {
		// the FIN occupies one sequence number
		segmentLength++;
		if (fState != CLOSED && fState != LISTEN && fState != SYNCHRONIZE_SENT) {
			TRACE("Receive(): peer is finishing connection!");
			fReceiveNext++;
			notify = true;

			// FIN implies PUSH
			fReceiveQueue.SetPushPointer();

			// we'll reply immediately to the FIN if we are not
			// transitioning to TIME WAIT so we immediately ACK it.
			action |= IMMEDIATE_ACKNOWLEDGE;

			// other side is closing connection; change states
			switch (fState) {
				case ESTABLISHED:
				case SYNCHRONIZE_RECEIVED:
					fState = FINISH_RECEIVED;
					T(State(this));
					break;
				case FINISH_SENT:
					// simultaneous close
					fState = CLOSING;
					T(State(this));
					break;
				case FINISH_ACKNOWLEDGED:
					fState = TIME_WAIT;
					T(State(this));
					_EnterTimeWait();
					break;
				case TIME_WAIT:
					// the FIN has been retransmitted, restart the timer
					_UpdateTimeWait();
					break;

				default:
					break;
			}
		}
	}

	if (notify)
		_NotifyReader();

	// anything that carried data (or a SYN) needs to be acknowledged
	if (bufferSize > 0 || (segment.flags & TCP_FLAG_SYNCHRONIZE) != 0)
		action |= ACKNOWLEDGE;

	_UpdateTimestamps(segment, segmentLength);

	TRACE("Receive() Action %" B_PRId32, action);

	return action;
}


/*!	Entry point for an incoming segment: dispatches it to the receive method
	matching the current state while holding the endpoint's lock, and then
	carries out what that method asked for (sending an immediate or delayed
	acknowledgement, sending queued data).
	If the endpoint is both closed and flagged for deletion on close, the
	socket is released after the lock has been dropped; DELETED_ENDPOINT is
	added to the result if release_socket() reports success.
	\return The action flags for the segment.
*/
int32
TCPEndpoint::SegmentReceived(tcp_segment_header& segment, net_buffer* buffer)
{
	MutexLocker locker(fLock);

	TRACE("SegmentReceived(): buffer %p (%" B_PRIu32 " bytes) address %s "
		"to %s flags %#" B_PRIx8 ", seq %" B_PRIu32 ", ack %" B_PRIu32
		", wnd %" B_PRIu32, buffer, buffer->size, PrintAddress(buffer->source),
		PrintAddress(buffer->destination), segment.flags, segment.sequence,
		segment.acknowledge,
		(uint32)segment.advertised_window << fSendWindowShift);
	T(Receive(this, segment,
		(uint32)segment.advertised_window << fSendWindowShift, buffer));
	// the segment is dropped unless a receive method decides otherwise
	int32 segmentAction = DROP;

	switch (fState) {
		case LISTEN:
			segmentAction = _ListenReceive(segment, buffer);
			break;

		case SYNCHRONIZE_SENT:
			segmentAction = _SynchronizeSentReceive(segment, buffer);
			break;

		case SYNCHRONIZE_RECEIVED:
		case ESTABLISHED:
		case FINISH_RECEIVED:
		case WAIT_FOR_FINISH_ACKNOWLEDGE:
		case FINISH_SENT:
		case FINISH_ACKNOWLEDGED:
		case CLOSING:
		case TIME_WAIT:
		case CLOSED:
			segmentAction = _Receive(segment, buffer);
			break;
	}

	// process acknowledge action as asked for by the *Receive() method
	if (segmentAction & IMMEDIATE_ACKNOWLEDGE)
		_SendAcknowledge(true);
	else if (segmentAction & ACKNOWLEDGE)
		DelayedAcknowledge();

	if (segmentAction & SEND_QUEUED)
		_SendQueued();

	if ((fFlags & (FLAG_CLOSED | FLAG_DELETE_ON_CLOSE))
			== (FLAG_CLOSED | FLAG_DELETE_ON_CLOSE)) {

		// the lock must not be held anymore when the socket is released
		locker.Unlock();
		if (gSocketModule->release_socket(socket))
			segmentAction |= DELETED_ENDPOINT;
	}

	return segmentAction;
}


// #pragma mark - send


tcp_segment_header
TCPEndpoint::_PrepareSendSegment()
{
	// we don't set FLAG_FINISH here, instead we do it
	// conditionally below depending if we are sending
	// the last bytes of the send queue.
1958 1959 uint8 flags = 0; 1960 switch (fState) { 1961 case CLOSED: 1962 flags = TCP_FLAG_RESET | TCP_FLAG_ACKNOWLEDGE; 1963 break; 1964 1965 case SYNCHRONIZE_SENT: 1966 flags = TCP_FLAG_SYNCHRONIZE; 1967 break; 1968 1969 case SYNCHRONIZE_RECEIVED: 1970 flags = TCP_FLAG_SYNCHRONIZE | TCP_FLAG_ACKNOWLEDGE; 1971 break; 1972 1973 case ESTABLISHED: 1974 case FINISH_RECEIVED: 1975 case FINISH_ACKNOWLEDGED: 1976 case TIME_WAIT: 1977 case WAIT_FOR_FINISH_ACKNOWLEDGE: 1978 case FINISH_SENT: 1979 case CLOSING: 1980 flags = TCP_FLAG_ACKNOWLEDGE; 1981 1982 default: 1983 break; 1984 } 1985 1986 tcp_segment_header segment(flags); 1987 1988 if ((fOptions & TCP_NOOPT) == 0) { 1989 if ((fFlags & FLAG_OPTION_TIMESTAMP) != 0) { 1990 segment.options |= TCP_HAS_TIMESTAMPS; 1991 segment.timestamp_reply = fReceivedTimestamp; 1992 segment.timestamp_value = tcp_now(); 1993 } 1994 1995 if ((segment.flags & TCP_FLAG_SYNCHRONIZE) != 0 1996 && fSendNext == fInitialSendSequence) { 1997 // add connection establishment options 1998 segment.max_segment_size = fReceiveMaxSegmentSize; 1999 if (fFlags & FLAG_OPTION_WINDOW_SCALE) { 2000 segment.options |= TCP_HAS_WINDOW_SCALE; 2001 segment.window_shift = fReceiveWindowShift; 2002 } 2003 if ((fFlags & FLAG_OPTION_SACK_PERMITTED) != 0) 2004 segment.options |= TCP_SACK_PERMITTED; 2005 } 2006 2007 if (!fReceiveQueue.IsContiguous() 2008 && (fFlags & FLAG_OPTION_SACK_PERMITTED) != 0) { 2009 segment.options |= TCP_HAS_SACK; 2010 int maxSackCount = MAX_SACK_BLKS 2011 - ((fFlags & FLAG_OPTION_TIMESTAMP) != 0) ? 
1 : 0; 2012 memset(segment.sacks, 0, sizeof(segment.sacks)); 2013 segment.sackCount = fReceiveQueue.PopulateSackInfo(fReceiveNext, 2014 maxSackCount, segment.sacks); 2015 } 2016 } 2017 2018 size_t availableBytes = fReceiveQueue.Free(); 2019 // window size must remain same for duplicate acknowledgements 2020 if (!fReceiveQueue.IsContiguous()) 2021 availableBytes = (fReceiveMaxAdvertised - fReceiveNext).Number(); 2022 segment.SetAdvertisedWindow(availableBytes, fReceiveWindowShift); 2023 2024 segment.acknowledge = fReceiveNext.Number(); 2025 2026 // Process urgent data 2027 if (fSendUrgentOffset > fSendNext) { 2028 segment.flags |= TCP_FLAG_URGENT; 2029 segment.urgent_offset = (fSendUrgentOffset - fSendNext).Number(); 2030 } else { 2031 fSendUrgentOffset = fSendUnacknowledged.Number(); 2032 // Keep urgent offset updated, so that it doesn't reach into our 2033 // send window on overlap 2034 segment.urgent_offset = 0; 2035 } 2036 2037 return segment; 2038 } 2039 2040 2041 status_t 2042 TCPEndpoint::_PrepareAndSend(tcp_segment_header& segment, net_buffer* buffer, 2043 bool isRetransmit) 2044 { 2045 LocalAddress().CopyTo(buffer->source); 2046 PeerAddress().CopyTo(buffer->destination); 2047 2048 uint32 size = buffer->size, segmentLength = size; 2049 segment.sequence = fSendNext.Number(); 2050 2051 TRACE("_PrepareAndSend(): buffer %p (%" B_PRIu32 " bytes) address %s to " 2052 "%s flags %#" B_PRIx8 ", seq %" B_PRIu32 ", ack %" B_PRIu32 2053 ", rwnd %" B_PRIu16 ", cwnd %" B_PRIu32 ", ssthresh %" B_PRIu32 2054 ", len %" B_PRIu32 ", first %" B_PRIu32 ", last %" B_PRIu32, 2055 buffer, buffer->size, PrintAddress(buffer->source), 2056 PrintAddress(buffer->destination), segment.flags, segment.sequence, 2057 segment.acknowledge, segment.advertised_window, 2058 fCongestionWindow, fSlowStartThreshold, segmentLength, 2059 fSendQueue.FirstSequence().Number(), 2060 fSendQueue.LastSequence().Number()); 2061 T(Send(this, segment, buffer, fSendQueue.FirstSequence(), 2062 
fSendQueue.LastSequence())); 2063 2064 PROBE(buffer, sendWindow); 2065 2066 status_t status = add_tcp_header(AddressModule(), segment, buffer); 2067 if (status != B_OK) { 2068 gBufferModule->free(buffer); 2069 return status; 2070 } 2071 2072 if (segment.flags & TCP_FLAG_SYNCHRONIZE) { 2073 segment.options &= ~TCP_HAS_WINDOW_SCALE; 2074 segment.max_segment_size = 0; 2075 size++; 2076 } 2077 2078 if (segment.flags & TCP_FLAG_FINISH) 2079 size++; 2080 2081 status = next->module->send_routed_data(next, fRoute, buffer); 2082 if (status < B_OK) { 2083 gBufferModule->free(buffer); 2084 return status; 2085 } 2086 2087 fSendNext += size; 2088 if (fSendMax < fSendNext) 2089 fSendMax = fSendNext; 2090 2091 fReceiveMaxAdvertised = fReceiveNext + segment.AdvertisedWindow(fReceiveWindowShift); 2092 2093 if (segmentLength != 0 && fState == ESTABLISHED) 2094 --fSendMaxSegments; 2095 2096 if (fSendTime == 0 && !isRetransmit 2097 && (segmentLength != 0 || (segment.flags & TCP_FLAG_SYNCHRONIZE) != 0)) { 2098 fSendTime = tcp_now(); 2099 fRoundTripStartSequence = segment.sequence; 2100 } 2101 2102 if (segment.flags & TCP_FLAG_ACKNOWLEDGE) { 2103 fLastAcknowledgeSent = segment.acknowledge; 2104 gStackModule->cancel_timer(&fDelayedAcknowledgeTimer); 2105 } 2106 2107 return B_OK; 2108 } 2109 2110 2111 inline bool 2112 TCPEndpoint::_ShouldSendSegment(tcp_segment_header& segment, uint32 length, 2113 uint32 segmentMaxSize, uint32 flightSize) 2114 { 2115 if (fState == ESTABLISHED && fSendMaxSegments == 0) 2116 return false; 2117 2118 if (length > 0) { 2119 // Avoid the silly window syndrome - we only send a segment in case: 2120 // - we have a full segment to send, or 2121 // - we're at the end of our buffer queue, or 2122 // - the buffer is at least larger than half of the maximum send window, 2123 // or 2124 // - we're retransmitting data 2125 if (length == segmentMaxSize 2126 || (fOptions & TCP_NODELAY) != 0 2127 || tcp_sequence(fSendNext + length) == fSendQueue.LastSequence() 2128 || 
(fSendMaxWindow > 0 && length >= fSendMaxWindow / 2)) 2129 return true; 2130 } 2131 2132 // check if we need to send a window update to the peer 2133 if (segment.advertised_window > 0) { 2134 // correct the window to take into account what already has been advertised 2135 uint32 window = segment.AdvertisedWindow(fReceiveWindowShift) 2136 - (fReceiveMaxAdvertised - fReceiveNext).Number(); 2137 2138 // if we can advertise a window larger than twice the maximum segment 2139 // size, or half the maximum buffer size we send a window update 2140 if (window >= (fReceiveMaxSegmentSize << 1) 2141 || window >= (socket->receive.buffer_size >> 1)) 2142 return true; 2143 } 2144 2145 if ((segment.flags & (TCP_FLAG_SYNCHRONIZE | TCP_FLAG_FINISH 2146 | TCP_FLAG_RESET)) != 0) 2147 return true; 2148 2149 // We do have urgent data pending 2150 if (fSendUrgentOffset > fSendNext) 2151 return true; 2152 2153 // there is no reason to send a segment just now 2154 return false; 2155 } 2156 2157 2158 status_t 2159 TCPEndpoint::_SendAcknowledge(bool force) 2160 { 2161 if (fRoute == NULL || fState == LISTEN) 2162 return B_ERROR; 2163 2164 tcp_segment_header segment = _PrepareSendSegment(); 2165 2166 // Is there actually anything to do? 2167 if (!force && fState == ESTABLISHED 2168 && fLastAcknowledgeSent == fReceiveNext 2169 && fReceiveQueue.IsContiguous() 2170 && !_ShouldSendSegment(segment, 0, 0, 0)) 2171 return B_OK; 2172 2173 net_buffer* buffer = gBufferModule->create(256); 2174 if (buffer == NULL) 2175 return B_NO_MEMORY; 2176 2177 return _PrepareAndSend(segment, buffer, false); 2178 } 2179 2180 2181 /*! Sends one or more TCP segments with the data waiting in the queue. 
	Sending starts at fSendNext, and is limited by the smaller of the peer's
	window and the congestion window.
	\param force If \c true, segments are sent without consulting
		_ShouldSendSegment(), and neither the FIN nor the PUSH flag is added
		to them.
	\return B_ERROR if there is no route or the connection has not reached
		the ESTABLISHED state yet, B_NO_MEMORY if no buffer could be created,
		any error from retrieving or sending the data, B_OK otherwise.
*/
status_t
TCPEndpoint::_SendQueued(bool force)
{
	if (fRoute == NULL || fState < ESTABLISHED)
		return B_ERROR;

	tcp_segment_header segment = _PrepareSendSegment();

	// the effective window is the smaller one of the peer's window and the
	// congestion window
	uint32 sendWindow = fSendWindow;
	if (fCongestionWindow > 0 && fCongestionWindow < sendWindow)
		sendWindow = fCongestionWindow;

	// fSendUnacknowledged
	//  |    fSendNext      fSendMax
	//  |        |              |
	//  v        v              v
	//  -----------------------------------
	//  | effective window           |
	//  -----------------------------------

	// Flight size represents the window of data which is currently in the
	// ether. We should never send data such as the flight size becomes larger
	// than the effective window. Note however that the effective window may be
	// reduced (by congestion for instance), so at some point in time flight
	// size may be larger than the currently calculated window.

	uint32 flightSize = (fSendMax - fSendUnacknowledged).Number();
	uint32 consumedWindow = (fSendNext - fSendUnacknowledged).Number();

	if (consumedWindow > sendWindow) {
		sendWindow = 0;
		// TODO: enter persist state? try to get a window update.
	} else
		sendWindow -= consumedWindow;

	uint32 length = min_c(fSendQueue.Available(fSendNext), sendWindow);
	if (length == 0 && !state_needs_finish(fState)) {
		// Nothing to send.
		return B_OK;
	}

	// The retransmit timer is started with the first segment only if nothing
	// is in flight before fSendNext; everything before fSendMax has been sent
	// before already, and therefore is a retransmission.
	bool shouldStartRetransmitTimer = fSendNext == fSendUnacknowledged;
	bool retransmit = fSendNext < fSendMax;

	if (fDuplicateAcknowledgeCount != 0) {
		// send at most 1 SMSS of data when under limited transmit, fast transmit/recovery
		length = min_c(length, fSendMaxSegmentSize);
	}

	do {
		// the options take away from the space available for data
		uint32 segmentMaxSize = fSendMaxSegmentSize
			- tcp_options_length(segment);
		uint32 segmentLength = min_c(length, segmentMaxSize);

		// This segment reaches the end of the send queue: add the FIN if the
		// state asks for one, and push the data.
		if ((fSendNext + segmentLength) == fSendQueue.LastSequence() && !force) {
			if (state_needs_finish(fState))
				segment.flags |= TCP_FLAG_FINISH;
			if (length > 0)
				segment.flags |= TCP_FLAG_PUSH;
		}

		// Determine if we should really send this segment
		if (!force && !retransmit && !_ShouldSendSegment(segment, segmentLength,
				segmentMaxSize, flightSize)) {
			// Data is waiting, but nothing is sent: unless one of the timers
			// already runs, the persist timer is started.
			if (fSendQueue.Available()
				&& !gStackModule->is_timer_active(&fPersistTimer)
				&& !gStackModule->is_timer_active(&fRetransmitTimer))
				_StartPersistTimer();
			break;
		}

		net_buffer *buffer = gBufferModule->create(256);
		if (buffer == NULL)
			return B_NO_MEMORY;

		status_t status = B_OK;
		if (segmentLength > 0)
			status = fSendQueue.Get(buffer, fSendNext, segmentLength);
		if (status < B_OK) {
			gBufferModule->free(buffer);
			return status;
		}

		// NOTE(review): sendWindow is not read again after this point within
		// this method.
		sendWindow -= buffer->size;

		// _PrepareAndSend() frees the buffer itself on failure
		status = _PrepareAndSend(segment, buffer, retransmit);
		if (status != B_OK)
			return status;

		if (shouldStartRetransmitTimer) {
			TRACE("starting initial retransmit timer of: %" B_PRIdBIGTIME,
				fRetransmitTimeout);
			gStackModule->set_timer(&fRetransmitTimer, fRetransmitTimeout);
			T(TimerSet(this, "retransmit", fRetransmitTimeout));
			shouldStartRetransmitTimer = false;
		}

		length -= segmentLength;
		// the control flags must only be sent once
		segment.flags &= ~(TCP_FLAG_SYNCHRONIZE | TCP_FLAG_RESET
			| TCP_FLAG_FINISH);
2282 2283 if (retransmit) 2284 break; 2285 2286 } while (length > 0); 2287 2288 return B_OK; 2289 } 2290 2291 2292 int 2293 TCPEndpoint::_MaxSegmentSize(const sockaddr* address) const 2294 { 2295 return next->module->get_mtu(next, address) - sizeof(tcp_header); 2296 } 2297 2298 2299 status_t 2300 TCPEndpoint::_PrepareSendPath(const sockaddr* peer) 2301 { 2302 if (fRoute == NULL) { 2303 fRoute = gDatalinkModule->get_route(Domain(), peer); 2304 if (fRoute == NULL) 2305 return ENETUNREACH; 2306 2307 if ((fRoute->flags & RTF_LOCAL) != 0) 2308 fFlags |= FLAG_LOCAL; 2309 } 2310 2311 // make sure connection does not already exist 2312 status_t status = fManager->SetConnection(this, *LocalAddress(), peer, 2313 fRoute->interface_address->local); 2314 if (status < B_OK) 2315 return status; 2316 2317 fInitialSendSequence = system_time() >> 4; 2318 fSendNext = fInitialSendSequence; 2319 fSendUnacknowledged = fInitialSendSequence; 2320 fSendMax = fInitialSendSequence; 2321 fSendUrgentOffset = fInitialSendSequence; 2322 fRecover = fInitialSendSequence.Number(); 2323 2324 // we are counting the SYN here 2325 fSendQueue.SetInitialSequence(fSendNext + 1); 2326 2327 fReceiveMaxSegmentSize = _MaxSegmentSize(peer); 2328 2329 // Compute the window shift we advertise to our peer - if it doesn't support 2330 // this option, this will be reset to 0 (when its SYN is received) 2331 fReceiveWindowShift = 0; 2332 while (fReceiveWindowShift < TCP_MAX_WINDOW_SHIFT 2333 && (0xffffUL << fReceiveWindowShift) < socket->receive.buffer_size) { 2334 fReceiveWindowShift++; 2335 } 2336 2337 return B_OK; 2338 } 2339 2340 2341 void 2342 TCPEndpoint::_Acknowledged(tcp_segment_header& segment) 2343 { 2344 TRACE("_Acknowledged(): ack %" B_PRIu32 "; uack %" B_PRIu32 "; next %" 2345 B_PRIu32 "; max %" B_PRIu32, segment.acknowledge, 2346 fSendUnacknowledged.Number(), fSendNext.Number(), fSendMax.Number()); 2347 2348 ASSERT(fSendUnacknowledged <= segment.acknowledge); 2349 2350 if (fSendUnacknowledged < 
segment.acknowledge) { 2351 fSendQueue.RemoveUntil(segment.acknowledge); 2352 2353 uint32 bytesAcknowledged = segment.acknowledge - fSendUnacknowledged.Number(); 2354 fPreviousHighestAcknowledge = fSendUnacknowledged; 2355 fSendUnacknowledged = segment.acknowledge; 2356 uint32 flightSize = (fSendMax - fSendUnacknowledged).Number(); 2357 int32 expectedSamples = flightSize / (fSendMaxSegmentSize << 1); 2358 2359 if (fPreviousHighestAcknowledge > fSendUnacknowledged) { 2360 // need to update the recover variable upon a sequence wraparound 2361 fRecover = segment.acknowledge - 1; 2362 } 2363 2364 // the acknowledgment of the SYN/ACK MUST NOT increase the size of the congestion window 2365 if (fSendUnacknowledged != fInitialSendSequence) { 2366 if (fCongestionWindow < fSlowStartThreshold) 2367 fCongestionWindow += min_c(bytesAcknowledged, fSendMaxSegmentSize); 2368 else { 2369 uint32 increment = fSendMaxSegmentSize * fSendMaxSegmentSize; 2370 2371 if (increment < fCongestionWindow) 2372 increment = 1; 2373 else 2374 increment /= fCongestionWindow; 2375 2376 fCongestionWindow += increment; 2377 } 2378 2379 fSendMaxSegments = UINT32_MAX; 2380 } 2381 2382 if ((fFlags & FLAG_RECOVERY) != 0) { 2383 fSendNext = fSendUnacknowledged; 2384 _SendQueued(); 2385 fCongestionWindow -= bytesAcknowledged; 2386 2387 if (bytesAcknowledged > fSendMaxSegmentSize) 2388 fCongestionWindow += fSendMaxSegmentSize; 2389 2390 fSendNext = fSendMax; 2391 } else 2392 fDuplicateAcknowledgeCount = 0; 2393 2394 if (fSendNext < fSendUnacknowledged) 2395 fSendNext = fSendUnacknowledged; 2396 2397 if (fFlags & FLAG_OPTION_TIMESTAMP) { 2398 _UpdateRoundTripTime(tcp_diff_timestamp(segment.timestamp_reply), 2399 expectedSamples > 0 ? 
expectedSamples : 1); 2400 } else if (fSendTime != 0 && fRoundTripStartSequence < segment.acknowledge) { 2401 _UpdateRoundTripTime(tcp_diff_timestamp(fSendTime), 1); 2402 fSendTime = 0; 2403 } 2404 2405 if (fSendUnacknowledged == fSendMax) { 2406 TRACE("all acknowledged, cancelling retransmission timer."); 2407 gStackModule->cancel_timer(&fRetransmitTimer); 2408 T(TimerSet(this, "retransmit", -1)); 2409 } else { 2410 TRACE("data acknowledged, resetting retransmission timer to: %" 2411 B_PRIdBIGTIME, fRetransmitTimeout); 2412 gStackModule->set_timer(&fRetransmitTimer, fRetransmitTimeout); 2413 T(TimerSet(this, "retransmit", fRetransmitTimeout)); 2414 } 2415 2416 if (is_writable(fState)) { 2417 // notify threads waiting on the socket to become writable again 2418 fSendCondition.NotifyAll(); 2419 gSocketModule->notify(socket, B_SELECT_WRITE, fSendQueue.Free()); 2420 } 2421 } 2422 2423 // if there is data left to be sent, send it now 2424 if (fSendQueue.Used() > 0) 2425 _SendQueued(); 2426 } 2427 2428 2429 void 2430 TCPEndpoint::_Retransmit() 2431 { 2432 TRACE("Retransmit()"); 2433 2434 if (fState < ESTABLISHED) { 2435 fRetransmitTimeout = TCP_SYN_RETRANSMIT_TIMEOUT; 2436 fCongestionWindow = fSendMaxSegmentSize; 2437 } else { 2438 _ResetSlowStart(); 2439 fDuplicateAcknowledgeCount = 0; 2440 // Do exponential back off of the retransmit timeout 2441 fRetransmitTimeout *= 2; 2442 if (fRetransmitTimeout > TCP_MAX_RETRANSMIT_TIMEOUT) 2443 fRetransmitTimeout = TCP_MAX_RETRANSMIT_TIMEOUT; 2444 } 2445 2446 fSendNext = fSendUnacknowledged; 2447 _SendQueued(); 2448 2449 fRecover = fSendNext.Number() - 1; 2450 if ((fFlags & FLAG_RECOVERY) != 0) 2451 fFlags &= ~FLAG_RECOVERY; 2452 } 2453 2454 2455 void 2456 TCPEndpoint::_UpdateRoundTripTime(int32 roundTripTime, int32 expectedSamples) 2457 { 2458 if (fSmoothedRoundTripTime == 0) { 2459 fSmoothedRoundTripTime = roundTripTime; 2460 fRoundTripVariation = roundTripTime / 2; 2461 fRetransmitTimeout = (fSmoothedRoundTripTime + max_c(100, 
fRoundTripVariation * 4)) 2462 * kTimestampFactor; 2463 } else { 2464 int32 delta = fSmoothedRoundTripTime - roundTripTime; 2465 if (delta < 0) 2466 delta = -delta; 2467 fRoundTripVariation += (delta - fRoundTripVariation) / (expectedSamples * 4); 2468 fSmoothedRoundTripTime += (roundTripTime - fSmoothedRoundTripTime) / (expectedSamples * 8); 2469 fRetransmitTimeout = (fSmoothedRoundTripTime + max_c(100, fRoundTripVariation * 4)) 2470 * kTimestampFactor; 2471 } 2472 2473 if (fRetransmitTimeout > TCP_MAX_RETRANSMIT_TIMEOUT) 2474 fRetransmitTimeout = TCP_MAX_RETRANSMIT_TIMEOUT; 2475 2476 if (fRetransmitTimeout < TCP_MIN_RETRANSMIT_TIMEOUT) 2477 fRetransmitTimeout = TCP_MIN_RETRANSMIT_TIMEOUT; 2478 2479 TRACE(" RTO is now %" B_PRIdBIGTIME " (after rtt %" B_PRId32 "ms)", 2480 fRetransmitTimeout, roundTripTime); 2481 } 2482 2483 2484 void 2485 TCPEndpoint::_ResetSlowStart() 2486 { 2487 fSlowStartThreshold = max_c((fSendMax - fSendUnacknowledged).Number() / 2, 2488 2 * fSendMaxSegmentSize); 2489 fCongestionWindow = fSendMaxSegmentSize; 2490 } 2491 2492 2493 // #pragma mark - timer 2494 2495 2496 /*static*/ void 2497 TCPEndpoint::_RetransmitTimer(net_timer* timer, void* _endpoint) 2498 { 2499 TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint; 2500 T(TimerTriggered(endpoint, "retransmit")); 2501 2502 MutexLocker locker(endpoint->fLock); 2503 if (!locker.IsLocked() || gStackModule->is_timer_active(timer)) 2504 return; 2505 2506 endpoint->_Retransmit(); 2507 } 2508 2509 2510 /*static*/ void 2511 TCPEndpoint::_PersistTimer(net_timer* timer, void* _endpoint) 2512 { 2513 TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint; 2514 T(TimerTriggered(endpoint, "persist")); 2515 2516 MutexLocker locker(endpoint->fLock); 2517 if (!locker.IsLocked()) 2518 return; 2519 2520 // the timer might not have been canceled early enough 2521 if (endpoint->State() == CLOSED) 2522 return; 2523 2524 endpoint->_SendQueued(true); 2525 } 2526 2527 2528 /*static*/ void 2529 
TCPEndpoint::_DelayedAcknowledgeTimer(net_timer* timer, void* _endpoint) 2530 { 2531 TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint; 2532 T(TimerTriggered(endpoint, "delayed ack")); 2533 2534 MutexLocker locker(endpoint->fLock); 2535 if (!locker.IsLocked()) 2536 return; 2537 2538 // the timer might not have been canceled early enough 2539 if (endpoint->State() == CLOSED) 2540 return; 2541 2542 endpoint->_SendAcknowledge(); 2543 } 2544 2545 2546 /*static*/ void 2547 TCPEndpoint::_TimeWaitTimer(net_timer* timer, void* _endpoint) 2548 { 2549 TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint; 2550 T(TimerTriggered(endpoint, "time-wait")); 2551 2552 MutexLocker locker(endpoint->fLock); 2553 if (!locker.IsLocked()) 2554 return; 2555 2556 if ((endpoint->fFlags & FLAG_CLOSED) == 0) { 2557 endpoint->fFlags |= FLAG_DELETE_ON_CLOSE; 2558 return; 2559 } 2560 2561 locker.Unlock(); 2562 2563 gSocketModule->release_socket(endpoint->socket); 2564 } 2565 2566 2567 /*static*/ status_t 2568 TCPEndpoint::_WaitForCondition(ConditionVariable& condition, 2569 MutexLocker& locker, bigtime_t timeout) 2570 { 2571 ConditionVariableEntry entry; 2572 condition.Add(&entry); 2573 2574 locker.Unlock(); 2575 status_t result = entry.Wait(B_ABSOLUTE_TIMEOUT | B_CAN_INTERRUPT, timeout); 2576 locker.Lock(); 2577 2578 return result; 2579 } 2580 2581 2582 // #pragma mark - 2583 2584 2585 void 2586 TCPEndpoint::Dump() const 2587 { 2588 kprintf("TCP endpoint %p\n", this); 2589 kprintf(" state: %s\n", name_for_state(fState)); 2590 kprintf(" flags: 0x%" B_PRIx32 "\n", fFlags); 2591 #if KDEBUG 2592 kprintf(" lock: { %p, holder: %" B_PRId32 " }\n", &fLock, fLock.holder); 2593 #endif 2594 kprintf(" accept sem: %" B_PRId32 "\n", fAcceptSemaphore); 2595 kprintf(" options: 0x%" B_PRIx32 "\n", (uint32)fOptions); 2596 kprintf(" send\n"); 2597 kprintf(" window shift: %" B_PRIu8 "\n", fSendWindowShift); 2598 kprintf(" unacknowledged: %" B_PRIu32 "\n", 2599 fSendUnacknowledged.Number()); 2600 kprintf(" next: %" 
B_PRIu32 "\n", fSendNext.Number()); 2601 kprintf(" max: %" B_PRIu32 "\n", fSendMax.Number()); 2602 kprintf(" urgent offset: %" B_PRIu32 "\n", fSendUrgentOffset.Number()); 2603 kprintf(" window: %" B_PRIu32 "\n", fSendWindow); 2604 kprintf(" max window: %" B_PRIu32 "\n", fSendMaxWindow); 2605 kprintf(" max segment size: %" B_PRIu32 "\n", fSendMaxSegmentSize); 2606 kprintf(" queue: %" B_PRIuSIZE " / %" B_PRIuSIZE "\n", fSendQueue.Used(), 2607 fSendQueue.Size()); 2608 #if DEBUG_TCP_BUFFER_QUEUE 2609 fSendQueue.Dump(); 2610 #endif 2611 kprintf(" last acknowledge sent: %" B_PRIu32 "\n", 2612 fLastAcknowledgeSent.Number()); 2613 kprintf(" initial sequence: %" B_PRIu32 "\n", 2614 fInitialSendSequence.Number()); 2615 kprintf(" receive\n"); 2616 kprintf(" window shift: %" B_PRIu8 "\n", fReceiveWindowShift); 2617 kprintf(" next: %" B_PRIu32 "\n", fReceiveNext.Number()); 2618 kprintf(" max advertised: %" B_PRIu32 "\n", 2619 fReceiveMaxAdvertised.Number()); 2620 kprintf(" window: %" B_PRIu32 "\n", fReceiveWindow); 2621 kprintf(" max segment size: %" B_PRIu32 "\n", fReceiveMaxSegmentSize); 2622 kprintf(" queue: %" B_PRIuSIZE " / %" B_PRIuSIZE "\n", 2623 fReceiveQueue.Available(), fReceiveQueue.Size()); 2624 #if DEBUG_TCP_BUFFER_QUEUE 2625 fReceiveQueue.Dump(); 2626 #endif 2627 kprintf(" initial sequence: %" B_PRIu32 "\n", 2628 fInitialReceiveSequence.Number()); 2629 kprintf(" duplicate acknowledge count: %" B_PRIu32 "\n", 2630 fDuplicateAcknowledgeCount); 2631 kprintf(" smoothed round trip time: %" B_PRId32 " (deviation %" B_PRId32 ")\n", 2632 fSmoothedRoundTripTime, fRoundTripVariation); 2633 kprintf(" retransmit timeout: %" B_PRId64 "\n", fRetransmitTimeout); 2634 kprintf(" congestion window: %" B_PRIu32 "\n", fCongestionWindow); 2635 kprintf(" slow start threshold: %" B_PRIu32 "\n", fSlowStartThreshold); 2636 } 2637 2638