1 /* 2 * Copyright 2006-2010, Haiku, Inc. All Rights Reserved. 3 * Distributed under the terms of the MIT License. 4 * 5 * Authors: 6 * Andrew Galante, haiku.galante@gmail.com 7 * Axel Dörfler, axeld@pinc-software.de 8 * Hugo Santos, hugosantos@gmail.com 9 */ 10 11 12 #include "TCPEndpoint.h" 13 14 #include <netinet/in.h> 15 #include <netinet/ip.h> 16 #include <netinet/tcp.h> 17 #include <new> 18 #include <signal.h> 19 #include <stdlib.h> 20 #include <string.h> 21 #include <stdint.h> 22 23 #include <KernelExport.h> 24 #include <Select.h> 25 26 #include <net_buffer.h> 27 #include <net_datalink.h> 28 #include <net_stat.h> 29 #include <NetBufferUtilities.h> 30 #include <NetUtilities.h> 31 32 #include <lock.h> 33 #include <tracing.h> 34 #include <util/AutoLock.h> 35 #include <util/list.h> 36 37 #include "EndpointManager.h" 38 39 40 // References: 41 // - RFC 793 - Transmission Control Protocol 42 // - RFC 813 - Window and Acknowledgement Strategy in TCP 43 // - RFC 1337 - TIME_WAIT Assassination Hazards in TCP 44 // 45 // Things this implementation currently doesn't implement: 46 // - TCP Slow Start, Congestion Avoidance, Fast Retransmit, and Fast Recovery, 47 // RFC 2001, RFC 2581, RFC 3042 48 // - NewReno Modification to TCP's Fast Recovery, RFC 2582 49 // - Explicit Congestion Notification (ECN), RFC 3168 50 // - SYN-Cache 51 // - SACK, Selective Acknowledgment - RFC 2018, RFC 2883, RFC 3517 52 // - Forward RTO-Recovery, RFC 4138 53 // - Time-Wait hash instead of keeping sockets alive 54 // 55 // Things incomplete in this implementation: 56 // - TCP Extensions for High Performance, RFC 1323 - RTTM, PAWS 57 58 #define PrintAddress(address) \ 59 AddressString(Domain(), address, true).Data() 60 61 //#define TRACE_TCP 62 //#define PROBE_TCP 63 64 #ifdef TRACE_TCP 65 // the space before ', ##args' is important in order for this to work with cpp 2.95 66 # define TRACE(format, args...) dprintf("%" B_PRId32 ": TCP [%" \ 67 B_PRIdBIGTIME "] %p (%12s) " format "\n", find_thread(NULL), \ 68 system_time(), this, name_for_state(fState) , ##args) 69 #else 70 # define TRACE(args...) do { } while (0) 71 #endif 72 73 #ifdef PROBE_TCP 74 # define PROBE(buffer, window) \ 75 dprintf("TCP PROBE %" B_PRIdBIGTIME " %s %s %" B_PRIu32 " snxt %" B_PRIu32 \ 76 " suna %" B_PRIu32 " cw %" B_PRIu32 " sst %" B_PRIu32 " win %" \ 77 B_PRIu32 " swin %" B_PRIu32 " smax-suna %" B_PRIu32 " savail %" \ 78 B_PRIuSIZE " sqused %" B_PRIuSIZE " rto %" B_PRIdBIGTIME "\n", \ 79 system_time(), PrintAddress(buffer->source), \ 80 PrintAddress(buffer->destination), buffer->size, fSendNext.Number(), \ 81 fSendUnacknowledged.Number(), fCongestionWindow, fSlowStartThreshold, \ 82 window, fSendWindow, (fSendMax - fSendUnacknowledged).Number(), \ 83 fSendQueue.Available(fSendNext), fSendQueue.Used(), fRetransmitTimeout) 84 #else 85 # define PROBE(buffer, window) do { } while (0) 86 #endif 87 88 #if TCP_TRACING 89 namespace TCPTracing { 90 91 class Receive : public AbstractTraceEntry { 92 public: 93 Receive(TCPEndpoint* endpoint, tcp_segment_header& segment, uint32 window, 94 net_buffer* buffer) 95 : 96 fEndpoint(endpoint), 97 fBuffer(buffer), 98 fBufferSize(buffer->size), 99 fSequence(segment.sequence), 100 fAcknowledge(segment.acknowledge), 101 fWindow(window), 102 fState(endpoint->State()), 103 fFlags(segment.flags) 104 { 105 Initialized(); 106 } 107 108 virtual void AddDump(TraceOutput& out) 109 { 110 out.Print("tcp:%p (%12s) receive buffer %p (%" B_PRIu32 " bytes), " 111 "flags %#" B_PRIx8 ", seq %" B_PRIu32 ", ack %" B_PRIu32 112 ", wnd %" B_PRIu32, fEndpoint, name_for_state(fState), fBuffer, 113 fBufferSize, fFlags, fSequence, fAcknowledge, fWindow); 114 } 115 116 protected: 117 TCPEndpoint* fEndpoint; 118 net_buffer* fBuffer; 119 uint32 fBufferSize; 120 uint32 fSequence; 121 uint32 fAcknowledge; 122 uint32 fWindow; 123 tcp_state fState; 124 uint8 fFlags; 125 }; 126 127 class Send : public AbstractTraceEntry { 128 public: 129 Send(TCPEndpoint* endpoint, tcp_segment_header& segment, net_buffer* buffer, 130 tcp_sequence firstSequence, tcp_sequence lastSequence) 131 : 132 fEndpoint(endpoint), 133 fBuffer(buffer), 134 fBufferSize(buffer->size), 135 fSequence(segment.sequence), 136 fAcknowledge(segment.acknowledge), 137 fFirstSequence(firstSequence.Number()), 138 fLastSequence(lastSequence.Number()), 139 fState(endpoint->State()), 140 fFlags(segment.flags) 141 { 142 Initialized(); 143 } 144 145 virtual void AddDump(TraceOutput& out) 146 { 147 out.Print("tcp:%p (%12s) send buffer %p (%" B_PRIu32 " bytes), " 148 "flags %#" B_PRIx8 ", seq %" B_PRIu32 ", ack %" B_PRIu32 149 ", first %" B_PRIu32 ", last %" B_PRIu32, fEndpoint, 150 name_for_state(fState), fBuffer, fBufferSize, fFlags, fSequence, 151 fAcknowledge, fFirstSequence, fLastSequence); 152 } 153 154 protected: 155 TCPEndpoint* fEndpoint; 156 net_buffer* fBuffer; 157 uint32 fBufferSize; 158 uint32 fSequence; 159 uint32 fAcknowledge; 160 uint32 fFirstSequence; 161 uint32 fLastSequence; 162 tcp_state fState; 163 uint8 fFlags; 164 }; 165 166 class State : public AbstractTraceEntry { 167 public: 168 State(TCPEndpoint* endpoint) 169 : 170 fEndpoint(endpoint), 171 fState(endpoint->State()) 172 { 173 Initialized(); 174 } 175 176 virtual void AddDump(TraceOutput& out) 177 { 178 out.Print("tcp:%p (%12s) state change", fEndpoint, 179 name_for_state(fState)); 180 } 181 182 protected: 183 TCPEndpoint* fEndpoint; 184 tcp_state fState; 185 }; 186 187 class Spawn : public AbstractTraceEntry { 188 public: 189 Spawn(TCPEndpoint* listeningEndpoint, TCPEndpoint* spawnedEndpoint) 190 : 191 fListeningEndpoint(listeningEndpoint), 192 fSpawnedEndpoint(spawnedEndpoint) 193 { 194 Initialized(); 195 } 196 197 virtual void AddDump(TraceOutput& out) 198 { 199 out.Print("tcp:%p spawns %p", fListeningEndpoint, fSpawnedEndpoint); 200 } 201 202 protected: 203 TCPEndpoint* fListeningEndpoint; 204 TCPEndpoint* fSpawnedEndpoint; 205 }; 206 207 class Error : public AbstractTraceEntry { 208 public: 209 Error(TCPEndpoint* endpoint, const char* error, int32 line) 210 : 211 fEndpoint(endpoint), 212 fLine(line), 213 fError(error), 214 fState(endpoint->State()) 215 { 216 Initialized(); 217 } 218 219 virtual void AddDump(TraceOutput& out) 220 { 221 out.Print("tcp:%p (%12s) error at line %" B_PRId32 ": %s", fEndpoint, 222 name_for_state(fState), fLine, fError); 223 } 224 225 protected: 226 TCPEndpoint* fEndpoint; 227 int32 fLine; 228 const char* fError; 229 tcp_state fState; 230 }; 231 232 class TimerSet : public AbstractTraceEntry { 233 public: 234 TimerSet(TCPEndpoint* endpoint, const char* which, bigtime_t timeout) 235 : 236 fEndpoint(endpoint), 237 fWhich(which), 238 fTimeout(timeout), 239 fState(endpoint->State()) 240 { 241 Initialized(); 242 } 243 244 virtual void AddDump(TraceOutput& out) 245 { 246 out.Print("tcp:%p (%12s) %s timer set to %" B_PRIdBIGTIME, fEndpoint, 247 name_for_state(fState), fWhich, fTimeout); 248 } 249 250 protected: 251 TCPEndpoint* fEndpoint; 252 const char* fWhich; 253 bigtime_t fTimeout; 254 tcp_state fState; 255 }; 256 257 class TimerTriggered : public AbstractTraceEntry { 258 public: 259 TimerTriggered(TCPEndpoint* endpoint, const char* which) 260 : 261 fEndpoint(endpoint), 262 fWhich(which), 263 fState(endpoint->State()) 264 { 265 Initialized(); 266 } 267 268 virtual void AddDump(TraceOutput& out) 269 { 270 out.Print("tcp:%p (%12s) %s timer triggered", fEndpoint, 271 name_for_state(fState), fWhich); 272 } 273 274 protected: 275 TCPEndpoint* fEndpoint; 276 const char* fWhich; 277 tcp_state fState; 278 }; 279 280 class APICall : public AbstractTraceEntry { 281 public: 282 APICall(TCPEndpoint* endpoint, const char* which) 283 : 284 fEndpoint(endpoint), 285 fWhich(which), 286 fState(endpoint->State()) 287 { 288 Initialized(); 289 } 290 291 virtual void AddDump(TraceOutput& out) 292 { 293 out.Print("tcp:%p (%12s) api call: %s", fEndpoint, 294 name_for_state(fState), fWhich); 295 } 296 297 protected: 298 TCPEndpoint* fEndpoint; 299 const char* fWhich; 300 tcp_state fState; 301 }; 302 303 } // namespace TCPTracing 304 305 # define T(x) new(std::nothrow) TCPTracing::x 306 #else 307 # define T(x) 308 #endif // TCP_TRACING 309 310 311 // constants for the fFlags field 312 enum { 313 FLAG_OPTION_WINDOW_SCALE = 0x01, 314 FLAG_OPTION_TIMESTAMP = 0x02, 315 // TODO: Should FLAG_NO_RECEIVE apply as well to received connections? 316 // That is, what is expected from accept() after a shutdown() 317 // is performed on a listen()ing socket. 318 FLAG_NO_RECEIVE = 0x04, 319 FLAG_CLOSED = 0x08, 320 FLAG_DELETE_ON_CLOSE = 0x10, 321 FLAG_LOCAL = 0x20, 322 FLAG_RECOVERY = 0x40, 323 FLAG_OPTION_SACK_PERMITTED = 0x80, 324 }; 325 326 327 static const int kTimestampFactor = 1000; 328 // conversion factor between usec system time and msec tcp time 329 330 331 static inline bigtime_t 332 absolute_timeout(bigtime_t timeout) 333 { 334 if (timeout == 0 || timeout == B_INFINITE_TIMEOUT) 335 return timeout; 336 337 return timeout + system_time(); 338 } 339 340 341 static inline status_t 342 posix_error(status_t error) 343 { 344 if (error == B_TIMED_OUT) 345 return B_WOULD_BLOCK; 346 347 return error; 348 } 349 350 351 static inline bool 352 in_window(const tcp_sequence& sequence, const tcp_sequence& receiveNext, 353 uint32 receiveWindow) 354 { 355 return sequence >= receiveNext && sequence < (receiveNext + receiveWindow); 356 } 357 358 359 static inline bool 360 segment_in_sequence(const tcp_segment_header& segment, int size, 361 const tcp_sequence& receiveNext, uint32 receiveWindow) 362 { 363 tcp_sequence sequence(segment.sequence); 364 if (size == 0) { 365 if (receiveWindow == 0) 366 return sequence == receiveNext; 367 return in_window(sequence, receiveNext, receiveWindow); 368 } else { 369 if (receiveWindow == 0) 370 return false; 371 return in_window(sequence, receiveNext, receiveWindow) 372 || in_window(sequence + size - 1, receiveNext, receiveWindow); 373 } 374 } 375 376 377 static inline bool 378 is_writable(tcp_state state) 379 { 380 return state == ESTABLISHED || state == FINISH_RECEIVED; 381 } 382 383 384 static inline bool 385 is_establishing(tcp_state state) 386 { 387 return state == SYNCHRONIZE_SENT || state == SYNCHRONIZE_RECEIVED; 388 } 389 390 391 static inline uint32 tcp_now() 392 { 393 return system_time() / kTimestampFactor; 394 } 395 396 397 static inline uint32 tcp_diff_timestamp(uint32 base) 398 { 399 uint32 now = tcp_now(); 400 401 if (now > base) 402 return now - base; 403 404 return now + UINT_MAX - base; 405 } 406 407 408 static inline bool 409 state_needs_finish(int32 state) 410 { 411 return state == WAIT_FOR_FINISH_ACKNOWLEDGE 412 || state == FINISH_SENT || state == CLOSING; 413 } 414 415 416 // #pragma mark - 417 418 419 TCPEndpoint::TCPEndpoint(net_socket* socket) 420 : 421 ProtocolSocket(socket), 422 fManager(NULL), 423 fOptions(0), 424 fSendWindowShift(0), 425 fReceiveWindowShift(0), 426 fSendUnacknowledged(0), 427 fSendNext(0), 428 fSendMax(0), 429 fSendUrgentOffset(0), 430 fSendWindow(0), 431 fSendMaxWindow(0), 432 fSendMaxSegmentSize(TCP_DEFAULT_MAX_SEGMENT_SIZE), 433 fSendMaxSegments(0), 434 fSendQueue(socket->send.buffer_size), 435 fInitialSendSequence(0), 436 fPreviousHighestAcknowledge(0), 437 fDuplicateAcknowledgeCount(0), 438 fPreviousFlightSize(0), 439 fRecover(0), 440 fRoute(NULL), 441 fReceiveNext(0), 442 fReceiveMaxAdvertised(0), 443 fReceiveWindow(socket->receive.buffer_size), 444 fReceiveMaxSegmentSize(TCP_DEFAULT_MAX_SEGMENT_SIZE), 445 fReceiveQueue(socket->receive.buffer_size), 446 fSmoothedRoundTripTime(0), 447 fRoundTripVariation(0), 448 fSendTime(0), 449 fRoundTripStartSequence(0), 450 fRetransmitTimeout(TCP_INITIAL_RTT), 451 fReceivedTimestamp(0), 452 fCongestionWindow(0), 453 fSlowStartThreshold(0), 454 fState(CLOSED), 455 fFlags(FLAG_OPTION_WINDOW_SCALE | FLAG_OPTION_TIMESTAMP | FLAG_OPTION_SACK_PERMITTED) 456 { 457 // TODO: to be replaced with a real read/write locking strategy! 458 mutex_init(&fLock, "tcp lock"); 459 460 fReceiveCondition.Init(this, "tcp receive"); 461 fSendCondition.Init(this, "tcp send"); 462 463 gStackModule->init_timer(&fPersistTimer, TCPEndpoint::_PersistTimer, this); 464 gStackModule->init_timer(&fRetransmitTimer, TCPEndpoint::_RetransmitTimer, 465 this); 466 gStackModule->init_timer(&fDelayedAcknowledgeTimer, 467 TCPEndpoint::_DelayedAcknowledgeTimer, this); 468 gStackModule->init_timer(&fTimeWaitTimer, TCPEndpoint::_TimeWaitTimer, 469 this); 470 471 T(APICall(this, "constructor")); 472 } 473 474 475 TCPEndpoint::~TCPEndpoint() 476 { 477 mutex_lock(&fLock); 478 479 T(APICall(this, "destructor")); 480 481 _CancelConnectionTimers(); 482 gStackModule->cancel_timer(&fTimeWaitTimer); 483 T(TimerSet(this, "time-wait", -1)); 484 485 if (fManager != NULL) { 486 fManager->Unbind(this); 487 put_endpoint_manager(fManager); 488 } 489 490 mutex_destroy(&fLock); 491 492 // we need to wait for all timers to return 493 gStackModule->wait_for_timer(&fRetransmitTimer); 494 gStackModule->wait_for_timer(&fPersistTimer); 495 gStackModule->wait_for_timer(&fDelayedAcknowledgeTimer); 496 gStackModule->wait_for_timer(&fTimeWaitTimer); 497 498 gDatalinkModule->put_route(Domain(), fRoute); 499 } 500 501 502 status_t 503 TCPEndpoint::InitCheck() const 504 { 505 return B_OK; 506 } 507 508 509 // #pragma mark - protocol API 510 511 512 status_t 513 TCPEndpoint::Open() 514 { 515 TRACE("Open()"); 516 T(APICall(this, "open")); 517 518 status_t status = ProtocolSocket::Open(); 519 if (status < B_OK) 520 return status; 521 522 fManager = get_endpoint_manager(Domain()); 523 if (fManager == NULL) 524 return EAFNOSUPPORT; 525 526 return B_OK; 527 } 528 529 530 status_t 531 TCPEndpoint::Close() 532 { 533 MutexLocker locker(fLock); 534 535 TRACE("Close()"); 536 T(APICall(this, "close")); 537 538 if (fState == LISTEN) 539 delete_sem(fAcceptSemaphore); 540 541 if (fState == SYNCHRONIZE_SENT || fState == LISTEN) { 542 // TODO: what about linger in case of SYNCHRONIZE_SENT? 543 fState = CLOSED; 544 T(State(this)); 545 return B_OK; 546 } 547 548 status_t status = _Disconnect(true); 549 if (status != B_OK) 550 return status; 551 552 if (socket->options & SO_LINGER) { 553 TRACE("Close(): Lingering for %i secs", socket->linger); 554 555 bigtime_t maximum = absolute_timeout(socket->linger * 1000000LL); 556 557 while (fSendQueue.Used() > 0) { 558 status = _WaitForCondition(fSendCondition, locker, maximum); 559 if (status == B_TIMED_OUT || status == B_WOULD_BLOCK) 560 break; 561 else if (status < B_OK) 562 return status; 563 } 564 565 TRACE("Close(): after waiting, the SendQ was left with %" B_PRIuSIZE 566 " bytes.", fSendQueue.Used()); 567 } 568 return B_OK; 569 } 570 571 572 void 573 TCPEndpoint::Free() 574 { 575 MutexLocker _(fLock); 576 577 TRACE("Free()"); 578 T(APICall(this, "free")); 579 580 if (fState <= SYNCHRONIZE_SENT) 581 return; 582 583 // we are only interested in the timer, not in changing state 584 _EnterTimeWait(); 585 586 fFlags |= FLAG_CLOSED; 587 if ((fFlags & FLAG_DELETE_ON_CLOSE) == 0) { 588 // we'll be freed later when the 2MSL timer expires 589 gSocketModule->acquire_socket(socket); 590 } 591 } 592 593 594 /*! Creates and sends a synchronize packet to /a address, and then waits 595 until the connection has been established or refused. 596 */ 597 status_t 598 TCPEndpoint::Connect(const sockaddr* address) 599 { 600 if (!AddressModule()->is_same_family(address)) 601 return EAFNOSUPPORT; 602 603 MutexLocker locker(fLock); 604 605 TRACE("Connect() on address %s", PrintAddress(address)); 606 T(APICall(this, "connect")); 607 608 if (gStackModule->is_restarted_syscall()) { 609 bigtime_t timeout = gStackModule->restore_syscall_restart_timeout(); 610 status_t status = _WaitForEstablished(locker, timeout); 611 TRACE(" Connect(): Connection complete: %s (timeout was %" 612 B_PRIdBIGTIME ")", strerror(status), timeout); 613 return posix_error(status); 614 } 615 616 // Can only call connect() from CLOSED or LISTEN states 617 // otherwise endpoint is considered already connected 618 if (fState == LISTEN) { 619 // this socket is about to connect; remove pending connections in the backlog 620 gSocketModule->set_max_backlog(socket, 0); 621 } else if (fState == ESTABLISHED) { 622 return EISCONN; 623 } else if (fState != CLOSED) 624 return EALREADY; 625 626 // consider destination address INADDR_ANY as INADDR_LOOPBACK 627 sockaddr_storage _address; 628 if (AddressModule()->is_empty_address(address, false)) { 629 AddressModule()->get_loopback_address((sockaddr *)&_address); 630 // for IPv4 and IPv6 the port is at the same offset 631 ((sockaddr_in &)_address).sin_port = ((sockaddr_in *)address)->sin_port; 632 address = (sockaddr *)&_address; 633 } 634 635 status_t status = _PrepareSendPath(address); 636 if (status < B_OK) 637 return status; 638 639 TRACE(" Connect(): starting 3-way handshake..."); 640 641 fState = SYNCHRONIZE_SENT; 642 T(State(this)); 643 644 // send SYN 645 status = _SendQueued(); 646 if (status != B_OK) { 647 _Close(); 648 return status; 649 } 650 651 // If we are running over Loopback, after _SendQueued() returns we 652 // may be in ESTABLISHED already. 653 if (fState == ESTABLISHED) { 654 TRACE(" Connect() completed after _SendQueued()"); 655 return B_OK; 656 } 657 658 // wait until 3-way handshake is complete (if needed) 659 bigtime_t timeout = min_c(socket->send.timeout, TCP_CONNECTION_TIMEOUT); 660 if (timeout == 0) { 661 // we're a non-blocking socket 662 TRACE(" Connect() delayed, return EINPROGRESS"); 663 return EINPROGRESS; 664 } 665 666 bigtime_t absoluteTimeout = absolute_timeout(timeout); 667 gStackModule->store_syscall_restart_timeout(absoluteTimeout); 668 669 status = _WaitForEstablished(locker, absoluteTimeout); 670 TRACE(" Connect(): Connection complete: %s (timeout was %" B_PRIdBIGTIME 671 ")", strerror(status), timeout); 672 return posix_error(status); 673 } 674 675 676 status_t 677 TCPEndpoint::Accept(struct net_socket** _acceptedSocket) 678 { 679 MutexLocker locker(fLock); 680 681 TRACE("Accept()"); 682 T(APICall(this, "accept")); 683 684 status_t status; 685 bigtime_t timeout = absolute_timeout(socket->receive.timeout); 686 if (gStackModule->is_restarted_syscall()) 687 timeout = gStackModule->restore_syscall_restart_timeout(); 688 else 689 gStackModule->store_syscall_restart_timeout(timeout); 690 691 do { 692 locker.Unlock(); 693 694 status = acquire_sem_etc(fAcceptSemaphore, 1, B_ABSOLUTE_TIMEOUT 695 | B_CAN_INTERRUPT, timeout); 696 if (status != B_OK) { 697 if (status == B_TIMED_OUT && socket->receive.timeout == 0) 698 return B_WOULD_BLOCK; 699 700 return status; 701 } 702 703 locker.Lock(); 704 status = gSocketModule->dequeue_connected(socket, _acceptedSocket); 705 #ifdef TRACE_TCP 706 if (status == B_OK) 707 TRACE(" Accept() returning %p", (*_acceptedSocket)->first_protocol); 708 #endif 709 } while (status != B_OK); 710 711 return status; 712 } 713 714 715 status_t 716 TCPEndpoint::Bind(const sockaddr *address) 717 { 718 if (address == NULL) 719 return B_BAD_VALUE; 720 721 MutexLocker lock(fLock); 722 723 TRACE("Bind() on address %s", PrintAddress(address)); 724 T(APICall(this, "bind")); 725 726 if (fState != CLOSED) 727 return EISCONN; 728 729 return fManager->Bind(this, address); 730 } 731 732 733 status_t 734 TCPEndpoint::Unbind(struct sockaddr *address) 735 { 736 MutexLocker _(fLock); 737 738 TRACE("Unbind()"); 739 T(APICall(this, "unbind")); 740 741 return fManager->Unbind(this); 742 } 743 744 745 status_t 746 TCPEndpoint::Listen(int count) 747 { 748 MutexLocker _(fLock); 749 750 TRACE("Listen()"); 751 T(APICall(this, "listen")); 752 753 if (fState != CLOSED && fState != LISTEN) 754 return B_BAD_VALUE; 755 756 if (fState == CLOSED) { 757 fAcceptSemaphore = create_sem(0, "tcp accept"); 758 if (fAcceptSemaphore < B_OK) 759 return ENOBUFS; 760 761 status_t status = fManager->SetPassive(this); 762 if (status != B_OK) { 763 delete_sem(fAcceptSemaphore); 764 fAcceptSemaphore = -1; 765 return status; 766 } 767 } 768 769 gSocketModule->set_max_backlog(socket, count); 770 771 fState = LISTEN; 772 T(State(this)); 773 return B_OK; 774 } 775 776 777 status_t 778 TCPEndpoint::Shutdown(int direction) 779 { 780 MutexLocker lock(fLock); 781 782 TRACE("Shutdown(%i)", direction); 783 T(APICall(this, "shutdown")); 784 785 if (direction == SHUT_RD || direction == SHUT_RDWR) 786 fFlags |= FLAG_NO_RECEIVE; 787 788 if (direction == SHUT_WR || direction == SHUT_RDWR) { 789 // TODO: That's not correct. After read/write shutting down the socket 790 // one should still be able to read previously arrived data. 791 _Disconnect(false); 792 } 793 794 return B_OK; 795 } 796 797 798 /*! Puts data contained in \a buffer into send buffer */ 799 status_t 800 TCPEndpoint::SendData(net_buffer *buffer) 801 { 802 MutexLocker lock(fLock); 803 804 TRACE("SendData(buffer %p, size %" B_PRIu32 ", flags %#" B_PRIx32 805 ") [total %" B_PRIuSIZE " bytes, has %" B_PRIuSIZE "]", buffer, 806 buffer->size, buffer->flags, fSendQueue.Size(), fSendQueue.Free()); 807 T(APICall(this, "senddata")); 808 809 uint32 flags = buffer->flags; 810 811 if (fState == CLOSED) 812 return ENOTCONN; 813 if (fState == LISTEN) 814 return EDESTADDRREQ; 815 if (!is_writable(fState) && !is_establishing(fState)) { 816 // we only send signals when called from userland 817 if (gStackModule->is_syscall() && (flags & MSG_NOSIGNAL) == 0) 818 send_signal(find_thread(NULL), SIGPIPE); 819 return EPIPE; 820 } 821 822 size_t left = buffer->size; 823 824 bigtime_t timeout = absolute_timeout(socket->send.timeout); 825 if (gStackModule->is_restarted_syscall()) 826 timeout = gStackModule->restore_syscall_restart_timeout(); 827 else 828 gStackModule->store_syscall_restart_timeout(timeout); 829 830 while (left > 0) { 831 while (fSendQueue.Free() < socket->send.low_water_mark) { 832 // wait until enough space is available 833 status_t status = _WaitForCondition(fSendCondition, lock, timeout); 834 if (status < B_OK) { 835 TRACE(" SendData() returning %s (%d)", 836 strerror(posix_error(status)), (int)posix_error(status)); 837 return posix_error(status); 838 } 839 840 if (!is_writable(fState) && !is_establishing(fState)) { 841 // we only send signals when called from userland 842 if (gStackModule->is_syscall()) 843 send_signal(find_thread(NULL), SIGPIPE); 844 return EPIPE; 845 } 846 } 847 848 size_t size = fSendQueue.Free(); 849 if (size < left) { 850 // we need to split the original buffer 851 net_buffer* clone = gBufferModule->clone(buffer, false); 852 // TODO: add offset/size parameter to net_buffer::clone() or 853 // even a move_data() function, as this is a bit inefficient 854 if (clone == NULL) 855 return ENOBUFS; 856 857 status_t status = gBufferModule->trim(clone, size); 858 if (status != B_OK) { 859 gBufferModule->free(clone); 860 return status; 861 } 862 863 gBufferModule->remove_header(buffer, size); 864 left -= size; 865 fSendQueue.Add(clone); 866 } else { 867 left -= buffer->size; 868 fSendQueue.Add(buffer); 869 } 870 } 871 872 TRACE(" SendData(): %" B_PRIuSIZE " bytes used.", fSendQueue.Used()); 873 874 bool force = false; 875 if ((flags & MSG_OOB) != 0) { 876 fSendUrgentOffset = fSendQueue.LastSequence(); 877 // RFC 961 specifies that the urgent offset points to the last 878 // byte of urgent data. However, this is commonly implemented as 879 // here, ie. it points to the first byte after the urgent data. 880 force = true; 881 } 882 if ((flags & MSG_EOF) != 0) 883 _Disconnect(false); 884 885 if (fState == ESTABLISHED || fState == FINISH_RECEIVED) 886 _SendQueued(force); 887 888 return B_OK; 889 } 890 891 892 ssize_t 893 TCPEndpoint::SendAvailable() 894 { 895 MutexLocker locker(fLock); 896 897 ssize_t available; 898 899 if (is_writable(fState)) 900 available = fSendQueue.Free(); 901 else if (is_establishing(fState)) 902 available = 0; 903 else 904 available = EPIPE; 905 906 TRACE("SendAvailable(): %" B_PRIdSSIZE, available); 907 T(APICall(this, "sendavailable")); 908 return available; 909 } 910 911 912 status_t 913 TCPEndpoint::FillStat(net_stat *stat) 914 { 915 MutexLocker _(fLock); 916 917 strlcpy(stat->state, name_for_state(fState), sizeof(stat->state)); 918 stat->receive_queue_size = fReceiveQueue.Available(); 919 stat->send_queue_size = fSendQueue.Used(); 920 921 return B_OK; 922 } 923 924 925 status_t 926 TCPEndpoint::ReadData(size_t numBytes, uint32 flags, net_buffer** _buffer) 927 { 928 MutexLocker locker(fLock); 929 930 TRACE("ReadData(%" B_PRIuSIZE " bytes, flags %#" B_PRIx32 ")", numBytes, 931 flags); 932 T(APICall(this, "readdata")); 933 934 *_buffer = NULL; 935 936 if (fState == CLOSED) 937 return ENOTCONN; 938 939 bigtime_t timeout = absolute_timeout(socket->receive.timeout); 940 if (gStackModule->is_restarted_syscall()) 941 timeout = gStackModule->restore_syscall_restart_timeout(); 942 else 943 gStackModule->store_syscall_restart_timeout(timeout); 944 945 if (fState == SYNCHRONIZE_SENT || fState == SYNCHRONIZE_RECEIVED) { 946 if (flags & MSG_DONTWAIT) 947 return B_WOULD_BLOCK; 948 949 status_t status = _WaitForEstablished(locker, timeout); 950 if (status < B_OK) 951 return posix_error(status); 952 } 953 954 size_t dataNeeded = socket->receive.low_water_mark; 955 956 // When MSG_WAITALL is set then the function should block 957 // until the full amount of data can be returned. 958 if (flags & MSG_WAITALL) 959 dataNeeded = numBytes; 960 961 // TODO: add support for urgent data (MSG_OOB) 962 963 while (true) { 964 if (fState == CLOSING || fState == WAIT_FOR_FINISH_ACKNOWLEDGE 965 || fState == TIME_WAIT) { 966 // ``Connection closing''. 967 return B_OK; 968 } 969 970 if (fReceiveQueue.Available() > 0) { 971 if (fReceiveQueue.Available() >= dataNeeded 972 || (fReceiveQueue.PushedData() > 0 973 && fReceiveQueue.PushedData() >= fReceiveQueue.Available())) 974 break; 975 } else if (fState == FINISH_RECEIVED) { 976 // ``If no text is awaiting delivery, the RECEIVE will 977 // get a Connection closing''. 978 return B_OK; 979 } 980 981 if ((flags & MSG_DONTWAIT) != 0 || socket->receive.timeout == 0) 982 return B_WOULD_BLOCK; 983 984 if ((fFlags & FLAG_NO_RECEIVE) != 0) 985 return B_OK; 986 987 status_t status = _WaitForCondition(fReceiveCondition, locker, timeout); 988 if (status < B_OK) { 989 // The Open Group base specification mentions that EINTR should be 990 // returned if the recv() is interrupted before _any data_ is 991 // available. So we actually check if there is data, and if so, 992 // push it to the user. 993 if ((status == B_TIMED_OUT || status == B_INTERRUPTED) 994 && fReceiveQueue.Available() > 0) 995 break; 996 997 return posix_error(status); 998 } 999 } 1000 1001 TRACE(" ReadData(): %" B_PRIuSIZE " are available.", 1002 fReceiveQueue.Available()); 1003 1004 if (numBytes < fReceiveQueue.Available()) 1005 fReceiveCondition.NotifyAll(); 1006 1007 bool clone = (flags & MSG_PEEK) != 0; 1008 1009 ssize_t receivedBytes = fReceiveQueue.Get(numBytes, !clone, _buffer); 1010 1011 TRACE(" ReadData(): %" B_PRIuSIZE " bytes kept.", 1012 fReceiveQueue.Available()); 1013 1014 if (fReceiveQueue.Available() == 0 && fState == FINISH_RECEIVED) 1015 socket->receive.low_water_mark = 0; 1016 1017 // if we are opening the window, check if we should send an ACK 1018 if (!clone) 1019 SendAcknowledge(false); 1020 1021 return receivedBytes; 1022 } 1023 1024 1025 ssize_t 1026 TCPEndpoint::ReadAvailable() 1027 { 1028 MutexLocker locker(fLock); 1029 1030 TRACE("ReadAvailable(): %" B_PRIdSSIZE, _AvailableData()); 1031 T(APICall(this, "readavailable")); 1032 1033 return _AvailableData(); 1034 } 1035 1036 1037 status_t 1038 TCPEndpoint::SetSendBufferSize(size_t length) 1039 { 1040 MutexLocker _(fLock); 1041 fSendQueue.SetMaxBytes(length); 1042 return B_OK; 1043 } 1044 1045 1046 status_t 1047 TCPEndpoint::SetReceiveBufferSize(size_t length) 1048 { 1049 MutexLocker _(fLock); 1050 fReceiveQueue.SetMaxBytes(length); 1051 return B_OK; 1052 } 1053 1054 1055 status_t 1056 TCPEndpoint::GetOption(int option, void* _value, int* _length) 1057 { 1058 if (*_length != sizeof(int)) 1059 return B_BAD_VALUE; 1060 1061 int* value = (int*)_value; 1062 1063 switch (option) { 1064 case TCP_NODELAY: 1065 if ((fOptions & TCP_NODELAY) != 0) 1066 *value = 1; 1067 else 1068 *value = 0; 1069 return B_OK; 1070 1071 case TCP_MAXSEG: 1072 *value = fReceiveMaxSegmentSize; 1073 return B_OK; 1074 1075 default: 1076 return B_BAD_VALUE; 1077 } 1078 } 1079 1080 1081 status_t 1082 TCPEndpoint::SetOption(int option, const void* _value, int length) 1083 { 1084 if (option != TCP_NODELAY) 1085 return B_BAD_VALUE; 1086 1087 if (length != sizeof(int)) 1088 return B_BAD_VALUE; 1089 1090 const int* value = (const int*)_value; 1091 1092 MutexLocker _(fLock); 1093 if (*value) 1094 fOptions |= TCP_NODELAY; 1095 else 1096 fOptions &= ~TCP_NODELAY; 1097 1098 return B_OK; 1099 } 1100 1101 1102 // #pragma mark - misc 1103 1104 1105 bool 1106 TCPEndpoint::IsBound() const 1107 { 1108 return !LocalAddress().IsEmpty(true); 1109 } 1110 1111 1112 bool 1113 TCPEndpoint::IsLocal() const 1114 { 1115 return (fFlags & FLAG_LOCAL) != 0; 1116 } 1117 1118 1119 status_t 1120 TCPEndpoint::DelayedAcknowledge() 1121 { 1122 if (gStackModule->cancel_timer(&fDelayedAcknowledgeTimer)) { 1123 // timer was active, send an ACK now (with the exception above, 1124 // we send every other ACK) 1125 T(TimerSet(this, "delayed ack", -1)); 1126 return SendAcknowledge(true); 1127 } 1128 1129 gStackModule->set_timer(&fDelayedAcknowledgeTimer, 1130 TCP_DELAYED_ACKNOWLEDGE_TIMEOUT); 1131 T(TimerSet(this, "delayed ack", TCP_DELAYED_ACKNOWLEDGE_TIMEOUT)); 1132 return B_OK; 1133 } 1134 1135 1136 status_t 1137 TCPEndpoint::SendAcknowledge(bool force) 1138 { 1139 return _SendQueued(force, 0); 1140 } 1141 1142 1143 void 1144 TCPEndpoint::_StartPersistTimer() 1145 { 1146 gStackModule->set_timer(&fPersistTimer, TCP_PERSIST_TIMEOUT); 1147 T(TimerSet(this, "persist", TCP_PERSIST_TIMEOUT)); 1148 } 1149 1150 1151 void 1152 TCPEndpoint::_EnterTimeWait() 1153 { 1154 TRACE("_EnterTimeWait()"); 1155 1156 if (fState == TIME_WAIT) { 1157 _CancelConnectionTimers(); 1158 } 1159 1160 _UpdateTimeWait(); 1161 } 1162 1163 1164 void 1165 TCPEndpoint::_UpdateTimeWait() 1166 { 1167 gStackModule->set_timer(&fTimeWaitTimer, TCP_MAX_SEGMENT_LIFETIME << 1); 1168 T(TimerSet(this, "time-wait", TCP_MAX_SEGMENT_LIFETIME << 1)); 1169 } 1170 1171 1172 void 1173 TCPEndpoint::_CancelConnectionTimers() 1174 { 1175 gStackModule->cancel_timer(&fRetransmitTimer); 1176 T(TimerSet(this, "retransmit", -1)); 1177 gStackModule->cancel_timer(&fPersistTimer); 1178 T(TimerSet(this, "persist", -1)); 1179 gStackModule->cancel_timer(&fDelayedAcknowledgeTimer); 1180 T(TimerSet(this, "delayed ack", -1)); 1181 } 1182 1183 1184 /*! Sends the FIN flag to the peer when the connection is still open. 1185 Moves the endpoint to the next state depending on where it was. 1186 */ 1187 status_t 1188 TCPEndpoint::_Disconnect(bool closing) 1189 { 1190 tcp_state previousState = fState; 1191 1192 if (fState == SYNCHRONIZE_RECEIVED || fState == ESTABLISHED) 1193 fState = FINISH_SENT; 1194 else if (fState == FINISH_RECEIVED) 1195 fState = WAIT_FOR_FINISH_ACKNOWLEDGE; 1196 else 1197 return B_OK; 1198 1199 T(State(this)); 1200 1201 status_t status = _SendQueued(); 1202 if (status != B_OK) { 1203 fState = previousState; 1204 T(State(this)); 1205 return status; 1206 } 1207 1208 return B_OK; 1209 } 1210 1211 1212 void 1213 TCPEndpoint::_MarkEstablished() 1214 { 1215 fState = ESTABLISHED; 1216 T(State(this)); 1217 1218 gSocketModule->set_connected(socket); 1219 if (gSocketModule->has_parent(socket)) 1220 release_sem_etc(fAcceptSemaphore, 1, B_DO_NOT_RESCHEDULE); 1221 1222 fSendCondition.NotifyAll(); 1223 gSocketModule->notify(socket, B_SELECT_WRITE, fSendQueue.Free()); 1224 } 1225 1226 1227 status_t 1228 TCPEndpoint::_WaitForEstablished(MutexLocker &locker, bigtime_t timeout) 1229 { 1230 // TODO: Checking for CLOSED seems correct, but breaks several neon tests. 1231 // When investigating this, also have a look at _Close() and _HandleReset(). 1232 while (fState < ESTABLISHED/* && fState != CLOSED*/) { 1233 if (socket->error != B_OK) 1234 return socket->error; 1235 1236 status_t status = _WaitForCondition(fSendCondition, locker, timeout); 1237 if (status < B_OK) 1238 return status; 1239 } 1240 1241 return B_OK; 1242 } 1243 1244 1245 // #pragma mark - receive 1246 1247 1248 void 1249 TCPEndpoint::_Close() 1250 { 1251 _CancelConnectionTimers(); 1252 fState = CLOSED; 1253 T(State(this)); 1254 1255 fFlags |= FLAG_DELETE_ON_CLOSE; 1256 1257 fSendCondition.NotifyAll(); 1258 _NotifyReader(); 1259 1260 if (gSocketModule->has_parent(socket)) { 1261 // We still have a parent - obviously, we haven't been accepted yet, 1262 // so no one could ever close us. 1263 _CancelConnectionTimers(); 1264 gSocketModule->set_aborted(socket); 1265 } 1266 } 1267 1268 1269 void 1270 TCPEndpoint::_HandleReset(status_t error) 1271 { 1272 socket->error = error; 1273 _Close(); 1274 1275 gSocketModule->notify(socket, B_SELECT_WRITE, error); 1276 gSocketModule->notify(socket, B_SELECT_ERROR, error); 1277 } 1278 1279 1280 void 1281 TCPEndpoint::_DuplicateAcknowledge(tcp_segment_header &segment) 1282 { 1283 if (fDuplicateAcknowledgeCount == 0) 1284 fPreviousFlightSize = (fSendMax - fSendUnacknowledged).Number(); 1285 1286 if (++fDuplicateAcknowledgeCount < 3) { 1287 if (fSendQueue.Available(fSendMax) != 0 && fSendWindow != 0) { 1288 fSendNext = fSendMax; 1289 fCongestionWindow += fDuplicateAcknowledgeCount * fSendMaxSegmentSize; 1290 _SendQueued(); 1291 TRACE("_DuplicateAcknowledge(): packet sent under limited transmit on receipt of dup ack"); 1292 fCongestionWindow -= fDuplicateAcknowledgeCount * fSendMaxSegmentSize; 1293 } 1294 } 1295 1296 if (fDuplicateAcknowledgeCount == 3) { 1297 if ((segment.acknowledge - 1) > fRecover || (fCongestionWindow > fSendMaxSegmentSize && 1298 (fSendUnacknowledged - fPreviousHighestAcknowledge) <= 4 * fSendMaxSegmentSize)) { 1299 fFlags |= FLAG_RECOVERY; 1300 fRecover = fSendMax.Number() - 1; 1301 fSlowStartThreshold = max_c(fPreviousFlightSize / 2, 2 * fSendMaxSegmentSize); 1302 fCongestionWindow = fSlowStartThreshold + 3 * fSendMaxSegmentSize; 1303 fSendNext = segment.acknowledge; 1304 _SendQueued(); 1305 TRACE("_DuplicateAcknowledge(): packet sent under fast restransmit on the receipt of 3rd dup ack"); 1306 } 1307 } else if (fDuplicateAcknowledgeCount > 3) { 1308 uint32 flightSize = (fSendMax - fSendUnacknowledged).Number(); 1309 if ((fDuplicateAcknowledgeCount - 3) * fSendMaxSegmentSize <= flightSize) 1310 fCongestionWindow += fSendMaxSegmentSize; 1311 if (fSendQueue.Available(fSendMax) != 0) { 1312 fSendNext = fSendMax; 1313 _SendQueued(); 1314 } 1315 } 1316 } 1317 1318 1319 void 1320 TCPEndpoint::_UpdateTimestamps(tcp_segment_header& segment, 1321 size_t segmentLength) 1322 { 1323 if (fFlags & FLAG_OPTION_TIMESTAMP) { 1324 tcp_sequence sequence(segment.sequence); 1325 1326 if (fLastAcknowledgeSent >= sequence 1327 && fLastAcknowledgeSent < (sequence + segmentLength)) 1328 fReceivedTimestamp = segment.timestamp_value; 1329 } 1330 } 1331 1332 1333 ssize_t 1334 TCPEndpoint::_AvailableData() const 1335 { 1336 // TODO: Refer to the FLAG_NO_RECEIVE comment above regarding 1337 // the application of FLAG_NO_RECEIVE in listen()ing 1338 // sockets. 1339 if (fState == LISTEN) 1340 return gSocketModule->count_connected(socket); 1341 if (fState == SYNCHRONIZE_SENT) 1342 return 0; 1343 1344 ssize_t availableData = fReceiveQueue.Available(); 1345 1346 if (availableData == 0 && !_ShouldReceive()) 1347 return ENOTCONN; 1348 1349 return availableData; 1350 } 1351 1352 1353 void 1354 TCPEndpoint::_NotifyReader() 1355 { 1356 fReceiveCondition.NotifyAll(); 1357 gSocketModule->notify(socket, B_SELECT_READ, _AvailableData()); 1358 } 1359 1360 1361 bool 1362 TCPEndpoint::_AddData(tcp_segment_header& segment, net_buffer* buffer) 1363 { 1364 if ((segment.flags & TCP_FLAG_FINISH) != 0) { 1365 // Remember the position of the finish received flag 1366 fFinishReceived = true; 1367 fFinishReceivedAt = segment.sequence + buffer->size; 1368 } 1369 1370 fReceiveQueue.Add(buffer, segment.sequence); 1371 fReceiveNext = fReceiveQueue.NextSequence(); 1372 1373 if (fFinishReceived) { 1374 // Set or reset the finish flag on the current segment 1375 if (fReceiveNext < fFinishReceivedAt) 1376 segment.flags &= ~TCP_FLAG_FINISH; 1377 else 1378 segment.flags |= TCP_FLAG_FINISH; 1379 } 1380 1381 TRACE(" _AddData(): adding data, receive next = %" B_PRIu32 ". Now have %" 1382 B_PRIuSIZE " bytes.", fReceiveNext.Number(), fReceiveQueue.Available()); 1383 1384 if ((segment.flags & TCP_FLAG_PUSH) != 0) 1385 fReceiveQueue.SetPushPointer(); 1386 1387 return fReceiveQueue.Available() > 0; 1388 } 1389 1390 1391 void 1392 TCPEndpoint::_PrepareReceivePath(tcp_segment_header& segment) 1393 { 1394 fInitialReceiveSequence = segment.sequence; 1395 fFinishReceived = false; 1396 1397 // count the received SYN 1398 segment.sequence++; 1399 1400 fReceiveNext = segment.sequence; 1401 fReceiveQueue.SetInitialSequence(segment.sequence); 1402 1403 if ((fOptions & TCP_NOOPT) == 0) { 1404 if (segment.max_segment_size > 0) { 1405 // The maximum size of a segment that a TCP endpoint really sends, 1406 // the "effective send MSS", MUST be the smaller of the send MSS and 1407 // the largest transmission size permitted by the IP layer: 1408 fSendMaxSegmentSize = min_c(segment.max_segment_size, 1409 _MaxSegmentSize(*PeerAddress())); 1410 } 1411 1412 if (segment.options & TCP_HAS_WINDOW_SCALE) { 1413 fFlags |= FLAG_OPTION_WINDOW_SCALE; 1414 fSendWindowShift = segment.window_shift; 1415 } else { 1416 fFlags &= ~FLAG_OPTION_WINDOW_SCALE; 1417 fReceiveWindowShift = 0; 1418 } 1419 1420 if (segment.options & TCP_HAS_TIMESTAMPS) { 1421 fFlags |= FLAG_OPTION_TIMESTAMP; 1422 fReceivedTimestamp = segment.timestamp_value; 1423 } else 1424 fFlags &= ~FLAG_OPTION_TIMESTAMP; 1425 1426 if ((segment.options & TCP_SACK_PERMITTED) == 0) 1427 fFlags &= ~FLAG_OPTION_SACK_PERMITTED; 1428 } 1429 1430 if (fSendMaxSegmentSize > 2190) 1431 fCongestionWindow = 2 * fSendMaxSegmentSize; 1432 else if (fSendMaxSegmentSize > 1095) 1433 fCongestionWindow = 3 * fSendMaxSegmentSize; 1434 else 1435 fCongestionWindow = 4 * fSendMaxSegmentSize; 1436 1437 fSendMaxSegments = fCongestionWindow / fSendMaxSegmentSize; 1438 fSlowStartThreshold = (uint32)segment.advertised_window << fSendWindowShift; 1439 } 1440 1441 1442 bool 1443 TCPEndpoint::_ShouldReceive() const 1444 { 1445 if ((fFlags & FLAG_NO_RECEIVE) != 0) 1446 return false; 1447 1448 return fState == ESTABLISHED || fState == FINISH_SENT 1449 || fState == FINISH_ACKNOWLEDGED || fState == FINISH_RECEIVED; 1450 } 1451 1452 1453 int32 1454 TCPEndpoint::_Spawn(TCPEndpoint* parent, tcp_segment_header& segment, 1455 net_buffer* buffer) 1456 { 1457 MutexLocker _(fLock); 1458 1459 // TODO error checking 1460 ProtocolSocket::Open(); 1461 1462 fState = SYNCHRONIZE_RECEIVED; 1463 T(Spawn(parent, this)); 1464 1465 fManager = parent->fManager; 1466 1467 LocalAddress().SetTo(buffer->destination); 1468 PeerAddress().SetTo(buffer->source); 1469 1470 TRACE("Spawn()"); 1471 1472 // TODO: proper error handling! 1473 if (fManager->BindChild(this) != B_OK) { 1474 T(Error(this, "binding failed", __LINE__)); 1475 return DROP; 1476 } 1477 if (_PrepareSendPath(*PeerAddress()) != B_OK) { 1478 T(Error(this, "prepare send faild", __LINE__)); 1479 return DROP; 1480 } 1481 1482 fOptions = parent->fOptions; 1483 fAcceptSemaphore = parent->fAcceptSemaphore; 1484 1485 _PrepareReceivePath(segment); 1486 1487 // send SYN+ACK 1488 if (_SendQueued() != B_OK) { 1489 T(Error(this, "sending failed", __LINE__)); 1490 return DROP; 1491 } 1492 1493 segment.flags &= ~TCP_FLAG_SYNCHRONIZE; 1494 // we handled this flag now, it must not be set for further processing 1495 1496 return _Receive(segment, buffer); 1497 } 1498 1499 1500 int32 1501 TCPEndpoint::_ListenReceive(tcp_segment_header& segment, net_buffer* buffer) 1502 { 1503 TRACE("ListenReceive()"); 1504 1505 // Essentially, we accept only TCP_FLAG_SYNCHRONIZE in this state, 1506 // but the error behaviour differs 1507 if (segment.flags & TCP_FLAG_RESET) 1508 return DROP; 1509 if (segment.flags & TCP_FLAG_ACKNOWLEDGE) 1510 return DROP | RESET; 1511 if ((segment.flags & TCP_FLAG_SYNCHRONIZE) == 0) 1512 return DROP; 1513 1514 // TODO: drop broadcast/multicast 1515 1516 // spawn new endpoint for accept() 1517 net_socket* newSocket; 1518 if (gSocketModule->spawn_pending_socket(socket, &newSocket) < B_OK) { 1519 T(Error(this, "spawning failed", __LINE__)); 1520 return DROP; 1521 } 1522 1523 return ((TCPEndpoint *)newSocket->first_protocol)->_Spawn(this, 1524 segment, buffer); 1525 } 1526 1527 1528 int32 1529 TCPEndpoint::_SynchronizeSentReceive(tcp_segment_header &segment, 1530 net_buffer *buffer) 1531 { 1532 TRACE("_SynchronizeSentReceive()"); 1533 1534 if ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0 1535 && (fInitialSendSequence >= segment.acknowledge 1536 || fSendMax < segment.acknowledge)) 1537 return DROP | RESET; 1538 1539 if (segment.flags & TCP_FLAG_RESET) { 1540 _HandleReset(ECONNREFUSED); 1541 return DROP; 1542 } 1543 1544 if ((segment.flags & TCP_FLAG_SYNCHRONIZE) == 0) 1545 return DROP; 1546 1547 fSendUnacknowledged = segment.acknowledge; 1548 _PrepareReceivePath(segment); 1549 1550 if (segment.flags & TCP_FLAG_ACKNOWLEDGE) { 1551 _MarkEstablished(); 1552 } else { 1553 // simultaneous open 1554 fState = SYNCHRONIZE_RECEIVED; 1555 T(State(this)); 1556 } 1557 1558 segment.flags &= ~TCP_FLAG_SYNCHRONIZE; 1559 // we handled this flag now, it must not be set for further processing 1560 1561 return _Receive(segment, buffer) | IMMEDIATE_ACKNOWLEDGE; 1562 } 1563 1564 1565 int32 1566 TCPEndpoint::_Receive(tcp_segment_header& segment, net_buffer* buffer) 1567 { 1568 // PAWS processing takes precedence over regular TCP acceptability check 1569 if ((fFlags & FLAG_OPTION_TIMESTAMP) != 0 && (segment.flags & TCP_FLAG_RESET) == 0) { 1570 if ((segment.options & TCP_HAS_TIMESTAMPS) == 0) 1571 return DROP; 1572 if ((int32)(fReceivedTimestamp - segment.timestamp_value) > 0 1573 && (fReceivedTimestamp - segment.timestamp_value) <= INT32_MAX) 1574 return DROP | IMMEDIATE_ACKNOWLEDGE; 1575 } 1576 1577 uint32 advertisedWindow = (uint32)segment.advertised_window 1578 << fSendWindowShift; 1579 size_t segmentLength = buffer->size; 1580 1581 // First, handle the most common case for uni-directional data transfer 1582 // (known as header prediction - the segment must not change the window, 1583 // and must be the expected sequence, and contain no control flags) 1584 1585 if (fState == ESTABLISHED 1586 && segment.AcknowledgeOnly() 1587 && fReceiveNext == segment.sequence 1588 && advertisedWindow > 0 && advertisedWindow == fSendWindow 1589 && fSendNext == fSendMax) { 1590 _UpdateTimestamps(segment, segmentLength); 1591 1592 if (segmentLength == 0) { 1593 // this is a pure acknowledge segment - we're on the sending end 1594 if (fSendUnacknowledged < segment.acknowledge 1595 && fSendMax >= segment.acknowledge) { 1596 _Acknowledged(segment); 1597 return DROP; 1598 } 1599 } else if (segment.acknowledge == fSendUnacknowledged 1600 && fReceiveQueue.IsContiguous() 1601 && fReceiveQueue.Free() >= segmentLength 1602 && (fFlags & FLAG_NO_RECEIVE) == 0) { 1603 if (_AddData(segment, buffer)) 1604 _NotifyReader(); 1605 1606 return KEEP | ((segment.flags & TCP_FLAG_PUSH) != 0 1607 ? IMMEDIATE_ACKNOWLEDGE : ACKNOWLEDGE); 1608 } 1609 } 1610 1611 // The fast path was not applicable, so we continue with the standard 1612 // processing of the incoming segment 1613 1614 ASSERT(fState != SYNCHRONIZE_SENT && fState != LISTEN); 1615 1616 if (fState != CLOSED && fState != TIME_WAIT) { 1617 // Check sequence number 1618 if (!segment_in_sequence(segment, segmentLength, fReceiveNext, 1619 fReceiveWindow)) { 1620 TRACE(" Receive(): segment out of window, next: %" B_PRIu32 1621 " wnd: %" B_PRIu32, fReceiveNext.Number(), fReceiveWindow); 1622 if ((segment.flags & TCP_FLAG_RESET) != 0) { 1623 // TODO: this doesn't look right - review! 1624 return DROP; 1625 } 1626 return DROP | IMMEDIATE_ACKNOWLEDGE; 1627 } 1628 } 1629 1630 if ((segment.flags & TCP_FLAG_RESET) != 0) { 1631 // Is this a valid reset? 1632 // We generally ignore resets in time wait state (see RFC 1337) 1633 if (fLastAcknowledgeSent <= segment.sequence 1634 && tcp_sequence(segment.sequence) < (fLastAcknowledgeSent 1635 + fReceiveWindow) 1636 && fState != TIME_WAIT) { 1637 status_t error; 1638 if (fState == SYNCHRONIZE_RECEIVED) 1639 error = ECONNREFUSED; 1640 else if (fState == CLOSING || fState == WAIT_FOR_FINISH_ACKNOWLEDGE) 1641 error = ENOTCONN; 1642 else 1643 error = ECONNRESET; 1644 1645 _HandleReset(error); 1646 } 1647 1648 return DROP; 1649 } 1650 1651 if ((segment.flags & TCP_FLAG_SYNCHRONIZE) != 0 1652 || (fState == SYNCHRONIZE_RECEIVED 1653 && (fInitialReceiveSequence > segment.sequence 1654 || ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0 1655 && (fSendUnacknowledged > segment.acknowledge 1656 || fSendMax < segment.acknowledge))))) { 1657 // reset the connection - either the initial SYN was faulty, or we 1658 // received a SYN within the data stream 1659 return DROP | RESET; 1660 } 1661 1662 // TODO: Check this! Why do we advertize a window outside of what we should 1663 // buffer? 1664 fReceiveWindow = max_c(fReceiveQueue.Free(), fReceiveWindow); 1665 // the window must not shrink 1666 1667 // trim buffer to be within the receive window 1668 int32 drop = (int32)(fReceiveNext - segment.sequence).Number(); 1669 if (drop > 0) { 1670 if ((uint32)drop > buffer->size 1671 || ((uint32)drop == buffer->size 1672 && (segment.flags & TCP_FLAG_FINISH) == 0)) { 1673 // don't accidently remove a FIN we shouldn't remove 1674 segment.flags &= ~TCP_FLAG_FINISH; 1675 drop = buffer->size; 1676 } 1677 1678 // remove duplicate data at the start 1679 TRACE("* remove %" B_PRId32 " bytes from the start", drop); 1680 gBufferModule->remove_header(buffer, drop); 1681 segment.sequence += drop; 1682 } 1683 1684 int32 action = KEEP; 1685 1686 // immediately acknowledge out-of-order segment to trigger fast-retransmit at the sender 1687 if (drop != 0) 1688 action |= IMMEDIATE_ACKNOWLEDGE; 1689 1690 drop = (int32)(segment.sequence + buffer->size 1691 - (fReceiveNext + fReceiveWindow)).Number(); 1692 if (drop > 0) { 1693 // remove data exceeding our window 1694 if ((uint32)drop >= buffer->size) { 1695 // if we can accept data, or the segment is not what we'd expect, 1696 // drop the segment (an immediate acknowledge is always triggered) 1697 if (fReceiveWindow != 0 || segment.sequence != fReceiveNext) 1698 return DROP | IMMEDIATE_ACKNOWLEDGE; 1699 1700 action |= IMMEDIATE_ACKNOWLEDGE; 1701 } 1702 1703 if ((segment.flags & TCP_FLAG_FINISH) != 0) { 1704 // we need to remove the finish, too, as part of the data 1705 drop--; 1706 } 1707 1708 segment.flags &= ~(TCP_FLAG_FINISH | TCP_FLAG_PUSH); 1709 TRACE("* remove %" B_PRId32 " bytes from the end", drop); 1710 gBufferModule->remove_trailer(buffer, drop); 1711 } 1712 1713 #ifdef TRACE_TCP 1714 if (advertisedWindow > fSendWindow) { 1715 TRACE(" Receive(): Window update %" B_PRIu32 " -> %" B_PRIu32, 1716 fSendWindow, advertisedWindow); 1717 } 1718 #endif 1719 1720 if (advertisedWindow > fSendWindow) 1721 action |= IMMEDIATE_ACKNOWLEDGE; 1722 1723 fSendWindow = advertisedWindow; 1724 if (advertisedWindow > fSendMaxWindow) 1725 fSendMaxWindow = advertisedWindow; 1726 1727 // Then look at the acknowledgement for any updates 1728 1729 if ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0) { 1730 // process acknowledged data 1731 if (fState == SYNCHRONIZE_RECEIVED) 1732 _MarkEstablished(); 1733 1734 if (fSendMax < segment.acknowledge) 1735 return DROP | IMMEDIATE_ACKNOWLEDGE; 1736 1737 if (segment.acknowledge == fSendUnacknowledged) { 1738 if (buffer->size == 0 && advertisedWindow == fSendWindow 1739 && (segment.flags & TCP_FLAG_FINISH) == 0 && fSendUnacknowledged != fSendMax) { 1740 TRACE("Receive(): duplicate ack!"); 1741 _DuplicateAcknowledge(segment); 1742 } 1743 } else if (segment.acknowledge < fSendUnacknowledged) { 1744 return DROP; 1745 } else { 1746 // this segment acknowledges in flight data 1747 1748 if (fDuplicateAcknowledgeCount >= 3) { 1749 // deflate the window. 1750 if (segment.acknowledge > fRecover) { 1751 uint32 flightSize = (fSendMax - fSendUnacknowledged).Number(); 1752 fCongestionWindow = min_c(fSlowStartThreshold, 1753 max_c(flightSize, fSendMaxSegmentSize) + fSendMaxSegmentSize); 1754 fFlags &= ~FLAG_RECOVERY; 1755 } 1756 } 1757 1758 if (fSendMax == segment.acknowledge) 1759 TRACE("Receive(): all inflight data ack'd!"); 1760 1761 if (segment.acknowledge > fSendQueue.LastSequence() 1762 && fState > ESTABLISHED) { 1763 TRACE("Receive(): FIN has been acknowledged!"); 1764 1765 switch (fState) { 1766 case FINISH_SENT: 1767 fState = FINISH_ACKNOWLEDGED; 1768 T(State(this)); 1769 break; 1770 case CLOSING: 1771 fState = TIME_WAIT; 1772 T(State(this)); 1773 _EnterTimeWait(); 1774 return DROP; 1775 case WAIT_FOR_FINISH_ACKNOWLEDGE: 1776 _Close(); 1777 break; 1778 1779 default: 1780 break; 1781 } 1782 } 1783 1784 if (fState != CLOSED) { 1785 tcp_sequence last = fLastAcknowledgeSent; 1786 _Acknowledged(segment); 1787 // we just sent an acknowledge, remove from action 1788 if (last < fLastAcknowledgeSent) 1789 action &= ~IMMEDIATE_ACKNOWLEDGE; 1790 } 1791 } 1792 } 1793 1794 if (segment.flags & TCP_FLAG_URGENT) { 1795 if (fState == ESTABLISHED || fState == FINISH_SENT 1796 || fState == FINISH_ACKNOWLEDGED) { 1797 // TODO: Handle urgent data: 1798 // - RCV.UP <- max(RCV.UP, SEG.UP) 1799 // - signal the user that urgent data is available (SIGURG) 1800 } 1801 } 1802 1803 bool notify = false; 1804 1805 // The buffer may be freed if its data is added to the queue, so cache 1806 // the size as we still need it later. 1807 uint32 bufferSize = buffer->size; 1808 1809 if ((bufferSize > 0 || (segment.flags & TCP_FLAG_FINISH) != 0) 1810 && _ShouldReceive()) 1811 notify = _AddData(segment, buffer); 1812 else { 1813 if ((fFlags & FLAG_NO_RECEIVE) != 0) 1814 fReceiveNext += buffer->size; 1815 1816 action = (action & ~KEEP) | DROP; 1817 } 1818 1819 if ((segment.flags & TCP_FLAG_FINISH) != 0) { 1820 segmentLength++; 1821 if (fState != CLOSED && fState != LISTEN && fState != SYNCHRONIZE_SENT) { 1822 TRACE("Receive(): peer is finishing connection!"); 1823 fReceiveNext++; 1824 notify = true; 1825 1826 // FIN implies PUSH 1827 fReceiveQueue.SetPushPointer(); 1828 1829 // we'll reply immediately to the FIN if we are not 1830 // transitioning to TIME WAIT so we immediatly ACK it. 1831 action |= IMMEDIATE_ACKNOWLEDGE; 1832 1833 // other side is closing connection; change states 1834 switch (fState) { 1835 case ESTABLISHED: 1836 case SYNCHRONIZE_RECEIVED: 1837 fState = FINISH_RECEIVED; 1838 T(State(this)); 1839 break; 1840 case FINISH_SENT: 1841 // simultaneous close 1842 fState = CLOSING; 1843 T(State(this)); 1844 break; 1845 case FINISH_ACKNOWLEDGED: 1846 fState = TIME_WAIT; 1847 T(State(this)); 1848 _EnterTimeWait(); 1849 break; 1850 case TIME_WAIT: 1851 _UpdateTimeWait(); 1852 break; 1853 1854 default: 1855 break; 1856 } 1857 } 1858 } 1859 1860 if (notify) 1861 _NotifyReader(); 1862 1863 if (bufferSize > 0 || (segment.flags & TCP_FLAG_SYNCHRONIZE) != 0) 1864 action |= ACKNOWLEDGE; 1865 1866 _UpdateTimestamps(segment, segmentLength); 1867 1868 TRACE("Receive() Action %" B_PRId32, action); 1869 1870 return action; 1871 } 1872 1873 1874 int32 1875 TCPEndpoint::SegmentReceived(tcp_segment_header& segment, net_buffer* buffer) 1876 { 1877 MutexLocker locker(fLock); 1878 1879 TRACE("SegmentReceived(): buffer %p (%" B_PRIu32 " bytes) address %s " 1880 "to %s flags %#" B_PRIx8 ", seq %" B_PRIu32 ", ack %" B_PRIu32 1881 ", wnd %" B_PRIu32, buffer, buffer->size, PrintAddress(buffer->source), 1882 PrintAddress(buffer->destination), segment.flags, segment.sequence, 1883 segment.acknowledge, 1884 (uint32)segment.advertised_window << fSendWindowShift); 1885 T(Receive(this, segment, 1886 (uint32)segment.advertised_window << fSendWindowShift, buffer)); 1887 int32 segmentAction = DROP; 1888 1889 switch (fState) { 1890 case LISTEN: 1891 segmentAction = _ListenReceive(segment, buffer); 1892 break; 1893 1894 case SYNCHRONIZE_SENT: 1895 segmentAction = _SynchronizeSentReceive(segment, buffer); 1896 break; 1897 1898 case SYNCHRONIZE_RECEIVED: 1899 case ESTABLISHED: 1900 case FINISH_RECEIVED: 1901 case WAIT_FOR_FINISH_ACKNOWLEDGE: 1902 case FINISH_SENT: 1903 case FINISH_ACKNOWLEDGED: 1904 case CLOSING: 1905 case TIME_WAIT: 1906 case CLOSED: 1907 segmentAction = _Receive(segment, buffer); 1908 break; 1909 } 1910 1911 // process acknowledge action as asked for by the *Receive() method 1912 if (segmentAction & IMMEDIATE_ACKNOWLEDGE) 1913 SendAcknowledge(true); 1914 else if (segmentAction & ACKNOWLEDGE) 1915 DelayedAcknowledge(); 1916 1917 if ((fFlags & (FLAG_CLOSED | FLAG_DELETE_ON_CLOSE)) 1918 == (FLAG_CLOSED | FLAG_DELETE_ON_CLOSE)) { 1919 1920 locker.Unlock(); 1921 if (gSocketModule->release_socket(socket)) 1922 segmentAction |= DELETED_ENDPOINT; 1923 } 1924 1925 return segmentAction; 1926 } 1927 1928 1929 // #pragma mark - send 1930 1931 1932 inline uint8 1933 TCPEndpoint::_CurrentFlags() 1934 { 1935 // we don't set FLAG_FINISH here, instead we do it 1936 // conditionally below depending if we are sending 1937 // the last bytes of the send queue. 1938 1939 switch (fState) { 1940 case CLOSED: 1941 return TCP_FLAG_RESET | TCP_FLAG_ACKNOWLEDGE; 1942 1943 case SYNCHRONIZE_SENT: 1944 return TCP_FLAG_SYNCHRONIZE; 1945 case SYNCHRONIZE_RECEIVED: 1946 return TCP_FLAG_SYNCHRONIZE | TCP_FLAG_ACKNOWLEDGE; 1947 1948 case ESTABLISHED: 1949 case FINISH_RECEIVED: 1950 case FINISH_ACKNOWLEDGED: 1951 case TIME_WAIT: 1952 case WAIT_FOR_FINISH_ACKNOWLEDGE: 1953 case FINISH_SENT: 1954 case CLOSING: 1955 return TCP_FLAG_ACKNOWLEDGE; 1956 1957 default: 1958 return 0; 1959 } 1960 } 1961 1962 1963 inline bool 1964 TCPEndpoint::_ShouldSendSegment(tcp_segment_header& segment, uint32 length, 1965 uint32 segmentMaxSize, uint32 flightSize) 1966 { 1967 if (fState == ESTABLISHED && fSendMaxSegments == 0) 1968 return false; 1969 1970 if (length > 0) { 1971 // Avoid the silly window syndrome - we only send a segment in case: 1972 // - we have a full segment to send, or 1973 // - we're at the end of our buffer queue, or 1974 // - the buffer is at least larger than half of the maximum send window, 1975 // or 1976 // - we're retransmitting data 1977 if (length == segmentMaxSize 1978 || (fOptions & TCP_NODELAY) != 0 1979 || tcp_sequence(fSendNext + length) == fSendQueue.LastSequence() 1980 || (fSendMaxWindow > 0 && length >= fSendMaxWindow / 2)) 1981 return true; 1982 } 1983 1984 // check if we need to send a window update to the peer 1985 if (segment.advertised_window > 0) { 1986 // correct the window to take into account what already has been advertised 1987 uint32 window = (segment.advertised_window << fReceiveWindowShift) 1988 - (fReceiveMaxAdvertised - fReceiveNext).Number(); 1989 1990 // if we can advertise a window larger than twice the maximum segment 1991 // size, or half the maximum buffer size we send a window update 1992 if (window >= (fReceiveMaxSegmentSize << 1) 1993 || window >= (socket->receive.buffer_size >> 1)) 1994 return true; 1995 } 1996 1997 if ((segment.flags & (TCP_FLAG_SYNCHRONIZE | TCP_FLAG_FINISH 1998 | TCP_FLAG_RESET)) != 0) 1999 return true; 2000 2001 // We do have urgent data pending 2002 if (fSendUrgentOffset > fSendNext) 2003 return true; 2004 2005 // there is no reason to send a segment just now 2006 return false; 2007 } 2008 2009 2010 status_t 2011 TCPEndpoint::_SendQueued(bool force) 2012 { 2013 return _SendQueued(force, fSendWindow); 2014 } 2015 2016 2017 /*! Sends one or more TCP segments with the data waiting in the queue, or some 2018 specific flags that need to be sent. 2019 */ 2020 status_t 2021 TCPEndpoint::_SendQueued(bool force, uint32 sendWindow) 2022 { 2023 if (fRoute == NULL) 2024 return B_ERROR; 2025 2026 // in passive state? 2027 if (fState == LISTEN) 2028 return B_ERROR; 2029 2030 tcp_segment_header segment(_CurrentFlags()); 2031 2032 if ((fOptions & TCP_NOOPT) == 0) { 2033 if ((fFlags & FLAG_OPTION_TIMESTAMP) != 0) { 2034 segment.options |= TCP_HAS_TIMESTAMPS; 2035 segment.timestamp_reply = fReceivedTimestamp; 2036 segment.timestamp_value = tcp_now(); 2037 } 2038 2039 // SACK information is embedded with duplicate acknowledgements 2040 if (!fReceiveQueue.IsContiguous() 2041 && fLastAcknowledgeSent <= fReceiveNext 2042 && (fFlags & FLAG_OPTION_SACK_PERMITTED) != 0) { 2043 segment.options |= TCP_HAS_SACK; 2044 int maxSackCount = MAX_SACK_BLKS 2045 - ((fFlags & FLAG_OPTION_TIMESTAMP) != 0); 2046 memset(segment.sacks, 0, sizeof(segment.sacks)); 2047 segment.sackCount = fReceiveQueue.PopulateSackInfo(fReceiveNext, 2048 maxSackCount, segment.sacks); 2049 } 2050 2051 if ((segment.flags & TCP_FLAG_SYNCHRONIZE) != 0 2052 && fSendNext == fInitialSendSequence) { 2053 // add connection establishment options 2054 segment.max_segment_size = fReceiveMaxSegmentSize; 2055 if (fFlags & FLAG_OPTION_WINDOW_SCALE) { 2056 segment.options |= TCP_HAS_WINDOW_SCALE; 2057 segment.window_shift = fReceiveWindowShift; 2058 } 2059 if ((fFlags & FLAG_OPTION_SACK_PERMITTED) != 0) 2060 segment.options |= TCP_SACK_PERMITTED; 2061 } 2062 } 2063 2064 size_t availableBytes = fReceiveQueue.Free(); 2065 // window size must remain same for duplicate acknowledgements 2066 if (!fReceiveQueue.IsContiguous()) 2067 availableBytes = (fReceiveMaxAdvertised - fReceiveNext).Number(); 2068 2069 if (fFlags & FLAG_OPTION_WINDOW_SCALE) 2070 segment.advertised_window = availableBytes >> fReceiveWindowShift; 2071 else 2072 segment.advertised_window = min_c(TCP_MAX_WINDOW, availableBytes); 2073 2074 segment.acknowledge = fReceiveNext.Number(); 2075 2076 // Process urgent data 2077 if (fSendUrgentOffset > fSendNext) { 2078 segment.flags |= TCP_FLAG_URGENT; 2079 segment.urgent_offset = (fSendUrgentOffset - fSendNext).Number(); 2080 } else { 2081 fSendUrgentOffset = fSendUnacknowledged.Number(); 2082 // Keep urgent offset updated, so that it doesn't reach into our 2083 // send window on overlap 2084 segment.urgent_offset = 0; 2085 } 2086 2087 if (fCongestionWindow > 0 && fCongestionWindow < sendWindow) 2088 sendWindow = fCongestionWindow; 2089 2090 // fSendUnacknowledged 2091 // | fSendNext fSendMax 2092 // | | | 2093 // v v v 2094 // ----------------------------------- 2095 // | effective window | 2096 // ----------------------------------- 2097 2098 // Flight size represents the window of data which is currently in the 2099 // ether. We should never send data such as the flight size becomes larger 2100 // than the effective window. Note however that the effective window may be 2101 // reduced (by congestion for instance), so at some point in time flight 2102 // size may be larger than the currently calculated window. 2103 2104 uint32 flightSize = (fSendMax - fSendUnacknowledged).Number(); 2105 uint32 consumedWindow = (fSendNext - fSendUnacknowledged).Number(); 2106 2107 if (consumedWindow > sendWindow) { 2108 sendWindow = 0; 2109 // TODO: enter persist state? try to get a window update. 2110 } else 2111 sendWindow -= consumedWindow; 2112 2113 uint32 length = min_c(fSendQueue.Available(fSendNext), sendWindow); 2114 bool shouldStartRetransmitTimer = fSendNext == fSendUnacknowledged; 2115 bool retransmit = fSendNext < fSendMax; 2116 2117 if (fDuplicateAcknowledgeCount != 0) { 2118 // send at most 1 SMSS of data when under limited transmit, fast transmit/recovery 2119 length = min_c(length, fSendMaxSegmentSize); 2120 } 2121 2122 do { 2123 uint32 segmentMaxSize = fSendMaxSegmentSize 2124 - tcp_options_length(segment); 2125 uint32 segmentLength = min_c(length, segmentMaxSize); 2126 2127 if (fSendNext + segmentLength == fSendQueue.LastSequence() && !force) { 2128 if (state_needs_finish(fState)) 2129 segment.flags |= TCP_FLAG_FINISH; 2130 if (length > 0) 2131 segment.flags |= TCP_FLAG_PUSH; 2132 } 2133 2134 // Determine if we should really send this segment 2135 if (!force && !retransmit && !_ShouldSendSegment(segment, segmentLength, 2136 segmentMaxSize, flightSize)) { 2137 if (fSendQueue.Available() 2138 && !gStackModule->is_timer_active(&fPersistTimer) 2139 && !gStackModule->is_timer_active(&fRetransmitTimer)) 2140 _StartPersistTimer(); 2141 break; 2142 } 2143 2144 net_buffer *buffer = gBufferModule->create(256); 2145 if (buffer == NULL) 2146 return B_NO_MEMORY; 2147 2148 status_t status = B_OK; 2149 if (segmentLength > 0) 2150 status = fSendQueue.Get(buffer, fSendNext, segmentLength); 2151 if (status < B_OK) { 2152 gBufferModule->free(buffer); 2153 return status; 2154 } 2155 2156 LocalAddress().CopyTo(buffer->source); 2157 PeerAddress().CopyTo(buffer->destination); 2158 2159 uint32 size = buffer->size; 2160 segment.sequence = fSendNext.Number(); 2161 2162 TRACE("SendQueued(): buffer %p (%" B_PRIu32 " bytes) address %s to " 2163 "%s flags %#" B_PRIx8 ", seq %" B_PRIu32 ", ack %" B_PRIu32 2164 ", rwnd %" B_PRIu16 ", cwnd %" B_PRIu32 ", ssthresh %" B_PRIu32 2165 ", len %" B_PRIu32 ", first %" B_PRIu32 ", last %" B_PRIu32, 2166 buffer, buffer->size, PrintAddress(buffer->source), 2167 PrintAddress(buffer->destination), segment.flags, segment.sequence, 2168 segment.acknowledge, segment.advertised_window, 2169 fCongestionWindow, fSlowStartThreshold, segmentLength, 2170 fSendQueue.FirstSequence().Number(), 2171 fSendQueue.LastSequence().Number()); 2172 T(Send(this, segment, buffer, fSendQueue.FirstSequence(), 2173 fSendQueue.LastSequence())); 2174 2175 PROBE(buffer, sendWindow); 2176 sendWindow -= buffer->size; 2177 2178 status = add_tcp_header(AddressModule(), segment, buffer); 2179 if (status != B_OK) { 2180 gBufferModule->free(buffer); 2181 return status; 2182 } 2183 2184 // Update send status - we need to do this before we send the data 2185 // for local connections as the answer is directly handled 2186 2187 if (segment.flags & TCP_FLAG_SYNCHRONIZE) { 2188 segment.options &= ~TCP_HAS_WINDOW_SCALE; 2189 segment.max_segment_size = 0; 2190 size++; 2191 } 2192 2193 if (segment.flags & TCP_FLAG_FINISH) 2194 size++; 2195 2196 uint32 sendMax = fSendMax.Number(); 2197 fSendNext += size; 2198 if (fSendMax < fSendNext) 2199 fSendMax = fSendNext; 2200 2201 fReceiveMaxAdvertised = fReceiveNext 2202 + ((uint32)segment.advertised_window << fReceiveWindowShift); 2203 2204 if (segmentLength != 0 && fState == ESTABLISHED) 2205 --fSendMaxSegments; 2206 2207 status = next->module->send_routed_data(next, fRoute, buffer); 2208 if (status < B_OK) { 2209 gBufferModule->free(buffer); 2210 2211 fSendNext = segment.sequence; 2212 fSendMax = sendMax; 2213 // restore send status 2214 return status; 2215 } 2216 2217 if (fSendTime == 0 && !retransmit 2218 && (segmentLength != 0 || (segment.flags & TCP_FLAG_SYNCHRONIZE) !=0)) { 2219 fSendTime = tcp_now(); 2220 fRoundTripStartSequence = segment.sequence; 2221 } 2222 2223 if (shouldStartRetransmitTimer && size > 0) { 2224 TRACE("starting initial retransmit timer of: %" B_PRIdBIGTIME, 2225 fRetransmitTimeout); 2226 gStackModule->set_timer(&fRetransmitTimer, fRetransmitTimeout); 2227 T(TimerSet(this, "retransmit", fRetransmitTimeout)); 2228 shouldStartRetransmitTimer = false; 2229 } 2230 2231 if (segment.flags & TCP_FLAG_ACKNOWLEDGE) { 2232 fLastAcknowledgeSent = segment.acknowledge; 2233 gStackModule->cancel_timer(&fDelayedAcknowledgeTimer); 2234 } 2235 2236 length -= segmentLength; 2237 segment.flags &= ~(TCP_FLAG_SYNCHRONIZE | TCP_FLAG_RESET 2238 | TCP_FLAG_FINISH); 2239 2240 if (retransmit) 2241 break; 2242 2243 } while (length > 0); 2244 2245 return B_OK; 2246 } 2247 2248 2249 int 2250 TCPEndpoint::_MaxSegmentSize(const sockaddr* address) const 2251 { 2252 return next->module->get_mtu(next, address) - sizeof(tcp_header); 2253 } 2254 2255 2256 status_t 2257 TCPEndpoint::_PrepareSendPath(const sockaddr* peer) 2258 { 2259 if (fRoute == NULL) { 2260 fRoute = gDatalinkModule->get_route(Domain(), peer); 2261 if (fRoute == NULL) 2262 return ENETUNREACH; 2263 2264 if ((fRoute->flags & RTF_LOCAL) != 0) 2265 fFlags |= FLAG_LOCAL; 2266 } 2267 2268 // make sure connection does not already exist 2269 status_t status = fManager->SetConnection(this, *LocalAddress(), peer, 2270 fRoute->interface_address->local); 2271 if (status < B_OK) 2272 return status; 2273 2274 fInitialSendSequence = system_time() >> 4; 2275 fSendNext = fInitialSendSequence; 2276 fSendUnacknowledged = fInitialSendSequence; 2277 fSendMax = fInitialSendSequence; 2278 fSendUrgentOffset = fInitialSendSequence; 2279 fRecover = fInitialSendSequence.Number(); 2280 2281 // we are counting the SYN here 2282 fSendQueue.SetInitialSequence(fSendNext + 1); 2283 2284 fReceiveMaxSegmentSize = _MaxSegmentSize(peer); 2285 2286 // Compute the window shift we advertise to our peer - if it doesn't support 2287 // this option, this will be reset to 0 (when its SYN is received) 2288 fReceiveWindowShift = 0; 2289 while (fReceiveWindowShift < TCP_MAX_WINDOW_SHIFT 2290 && (0xffffUL << fReceiveWindowShift) < socket->receive.buffer_size) { 2291 fReceiveWindowShift++; 2292 } 2293 2294 return B_OK; 2295 } 2296 2297 2298 void 2299 TCPEndpoint::_Acknowledged(tcp_segment_header& segment) 2300 { 2301 TRACE("_Acknowledged(): ack %" B_PRIu32 "; uack %" B_PRIu32 "; next %" 2302 B_PRIu32 "; max %" B_PRIu32, segment.acknowledge, 2303 fSendUnacknowledged.Number(), fSendNext.Number(), fSendMax.Number()); 2304 2305 ASSERT(fSendUnacknowledged <= segment.acknowledge); 2306 2307 if (fSendUnacknowledged < segment.acknowledge) { 2308 fSendQueue.RemoveUntil(segment.acknowledge); 2309 2310 uint32 bytesAcknowledged = segment.acknowledge - fSendUnacknowledged.Number(); 2311 fPreviousHighestAcknowledge = fSendUnacknowledged; 2312 fSendUnacknowledged = segment.acknowledge; 2313 uint32 flightSize = (fSendMax - fSendUnacknowledged).Number(); 2314 int32 expectedSamples = flightSize / (fSendMaxSegmentSize << 1); 2315 2316 if (fPreviousHighestAcknowledge > fSendUnacknowledged) { 2317 // need to update the recover variable upon a sequence wraparound 2318 fRecover = segment.acknowledge - 1; 2319 } 2320 2321 // the acknowledgment of the SYN/ACK MUST NOT increase the size of the congestion window 2322 if (fSendUnacknowledged != fInitialSendSequence) { 2323 if (fCongestionWindow < fSlowStartThreshold) 2324 fCongestionWindow += min_c(bytesAcknowledged, fSendMaxSegmentSize); 2325 else { 2326 uint32 increment = fSendMaxSegmentSize * fSendMaxSegmentSize; 2327 2328 if (increment < fCongestionWindow) 2329 increment = 1; 2330 else 2331 increment /= fCongestionWindow; 2332 2333 fCongestionWindow += increment; 2334 } 2335 2336 fSendMaxSegments = UINT32_MAX; 2337 } 2338 2339 if ((fFlags & FLAG_RECOVERY) != 0) { 2340 fSendNext = fSendUnacknowledged; 2341 _SendQueued(); 2342 fCongestionWindow -= bytesAcknowledged; 2343 2344 if (bytesAcknowledged > fSendMaxSegmentSize) 2345 fCongestionWindow += fSendMaxSegmentSize; 2346 2347 fSendNext = fSendMax; 2348 } else 2349 fDuplicateAcknowledgeCount = 0; 2350 2351 if (fSendNext < fSendUnacknowledged) 2352 fSendNext = fSendUnacknowledged; 2353 2354 if (fFlags & FLAG_OPTION_TIMESTAMP) { 2355 _UpdateRoundTripTime(tcp_diff_timestamp(segment.timestamp_reply), 2356 expectedSamples > 0 ? expectedSamples : 1); 2357 } else if (fSendTime != 0 && fRoundTripStartSequence < segment.acknowledge) { 2358 _UpdateRoundTripTime(tcp_diff_timestamp(fSendTime), 1); 2359 fSendTime = 0; 2360 } 2361 2362 if (fSendUnacknowledged == fSendMax) { 2363 TRACE("all acknowledged, cancelling retransmission timer."); 2364 gStackModule->cancel_timer(&fRetransmitTimer); 2365 T(TimerSet(this, "retransmit", -1)); 2366 } else { 2367 TRACE("data acknowledged, resetting retransmission timer to: %" 2368 B_PRIdBIGTIME, fRetransmitTimeout); 2369 gStackModule->set_timer(&fRetransmitTimer, fRetransmitTimeout); 2370 T(TimerSet(this, "retransmit", fRetransmitTimeout)); 2371 } 2372 2373 if (is_writable(fState)) { 2374 // notify threads waiting on the socket to become writable again 2375 fSendCondition.NotifyAll(); 2376 gSocketModule->notify(socket, B_SELECT_WRITE, fSendQueue.Free()); 2377 } 2378 } 2379 2380 // if there is data left to be sent, send it now 2381 if (fSendQueue.Used() > 0) 2382 _SendQueued(); 2383 } 2384 2385 2386 void 2387 TCPEndpoint::_Retransmit() 2388 { 2389 TRACE("Retransmit()"); 2390 2391 if (fState < ESTABLISHED) { 2392 fRetransmitTimeout = TCP_SYN_RETRANSMIT_TIMEOUT; 2393 fCongestionWindow = fSendMaxSegmentSize; 2394 } else { 2395 _ResetSlowStart(); 2396 fDuplicateAcknowledgeCount = 0; 2397 // Do exponential back off of the retransmit timeout 2398 fRetransmitTimeout *= 2; 2399 if (fRetransmitTimeout > TCP_MAX_RETRANSMIT_TIMEOUT) 2400 fRetransmitTimeout = TCP_MAX_RETRANSMIT_TIMEOUT; 2401 } 2402 2403 fSendNext = fSendUnacknowledged; 2404 _SendQueued(); 2405 2406 fRecover = fSendNext.Number() - 1; 2407 if ((fFlags & FLAG_RECOVERY) != 0) 2408 fFlags &= ~FLAG_RECOVERY; 2409 } 2410 2411 2412 void 2413 TCPEndpoint::_UpdateRoundTripTime(int32 roundTripTime, int32 expectedSamples) 2414 { 2415 if (fSmoothedRoundTripTime == 0) { 2416 fSmoothedRoundTripTime = roundTripTime; 2417 fRoundTripVariation = roundTripTime / 2; 2418 fRetransmitTimeout = (fSmoothedRoundTripTime + max_c(100, fRoundTripVariation * 4)) 2419 * kTimestampFactor; 2420 } else { 2421 int32 delta = fSmoothedRoundTripTime - roundTripTime; 2422 if (delta < 0) 2423 delta = -delta; 2424 fRoundTripVariation += (delta - fRoundTripVariation) / (expectedSamples * 4); 2425 fSmoothedRoundTripTime += (roundTripTime - fSmoothedRoundTripTime) / (expectedSamples * 8); 2426 fRetransmitTimeout = (fSmoothedRoundTripTime + max_c(100, fRoundTripVariation * 4)) 2427 * kTimestampFactor; 2428 } 2429 2430 if (fRetransmitTimeout > TCP_MAX_RETRANSMIT_TIMEOUT) 2431 fRetransmitTimeout = TCP_MAX_RETRANSMIT_TIMEOUT; 2432 2433 if (fRetransmitTimeout < TCP_MIN_RETRANSMIT_TIMEOUT) 2434 fRetransmitTimeout = TCP_MIN_RETRANSMIT_TIMEOUT; 2435 2436 TRACE(" RTO is now %" B_PRIdBIGTIME " (after rtt %" B_PRId32 "ms)", 2437 fRetransmitTimeout, roundTripTime); 2438 } 2439 2440 2441 void 2442 TCPEndpoint::_ResetSlowStart() 2443 { 2444 fSlowStartThreshold = max_c((fSendMax - fSendUnacknowledged).Number() / 2, 2445 2 * fSendMaxSegmentSize); 2446 fCongestionWindow = fSendMaxSegmentSize; 2447 } 2448 2449 2450 // #pragma mark - timer 2451 2452 2453 /*static*/ void 2454 TCPEndpoint::_RetransmitTimer(net_timer* timer, void* _endpoint) 2455 { 2456 TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint; 2457 T(TimerTriggered(endpoint, "retransmit")); 2458 2459 MutexLocker locker(endpoint->fLock); 2460 if (!locker.IsLocked() || gStackModule->is_timer_active(timer)) 2461 return; 2462 2463 endpoint->_Retransmit(); 2464 } 2465 2466 2467 /*static*/ void 2468 TCPEndpoint::_PersistTimer(net_timer* timer, void* _endpoint) 2469 { 2470 TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint; 2471 T(TimerTriggered(endpoint, "persist")); 2472 2473 MutexLocker locker(endpoint->fLock); 2474 if (!locker.IsLocked()) 2475 return; 2476 2477 // the timer might not have been canceled early enough 2478 if (endpoint->State() == CLOSED) 2479 return; 2480 2481 endpoint->_SendQueued(true); 2482 } 2483 2484 2485 /*static*/ void 2486 TCPEndpoint::_DelayedAcknowledgeTimer(net_timer* timer, void* _endpoint) 2487 { 2488 TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint; 2489 T(TimerTriggered(endpoint, "delayed ack")); 2490 2491 MutexLocker locker(endpoint->fLock); 2492 if (!locker.IsLocked()) 2493 return; 2494 2495 // the timer might not have been canceled early enough 2496 if (endpoint->State() == CLOSED) 2497 return; 2498 2499 endpoint->SendAcknowledge(true); 2500 } 2501 2502 2503 /*static*/ void 2504 TCPEndpoint::_TimeWaitTimer(net_timer* timer, void* _endpoint) 2505 { 2506 TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint; 2507 T(TimerTriggered(endpoint, "time-wait")); 2508 2509 MutexLocker locker(endpoint->fLock); 2510 if (!locker.IsLocked()) 2511 return; 2512 2513 if ((endpoint->fFlags & FLAG_CLOSED) == 0) { 2514 endpoint->fFlags |= FLAG_DELETE_ON_CLOSE; 2515 return; 2516 } 2517 2518 locker.Unlock(); 2519 2520 gSocketModule->release_socket(endpoint->socket); 2521 } 2522 2523 2524 /*static*/ status_t 2525 TCPEndpoint::_WaitForCondition(ConditionVariable& condition, 2526 MutexLocker& locker, bigtime_t timeout) 2527 { 2528 ConditionVariableEntry entry; 2529 condition.Add(&entry); 2530 2531 locker.Unlock(); 2532 status_t result = entry.Wait(B_ABSOLUTE_TIMEOUT | B_CAN_INTERRUPT, timeout); 2533 locker.Lock(); 2534 2535 return result; 2536 } 2537 2538 2539 // #pragma mark - 2540 2541 2542 void 2543 TCPEndpoint::Dump() const 2544 { 2545 kprintf("TCP endpoint %p\n", this); 2546 kprintf(" state: %s\n", name_for_state(fState)); 2547 kprintf(" flags: 0x%" B_PRIx32 "\n", fFlags); 2548 #if KDEBUG 2549 kprintf(" lock: { %p, holder: %" B_PRId32 " }\n", &fLock, fLock.holder); 2550 #endif 2551 kprintf(" accept sem: %" B_PRId32 "\n", fAcceptSemaphore); 2552 kprintf(" options: 0x%" B_PRIx32 "\n", (uint32)fOptions); 2553 kprintf(" send\n"); 2554 kprintf(" window shift: %" B_PRIu8 "\n", fSendWindowShift); 2555 kprintf(" unacknowledged: %" B_PRIu32 "\n", 2556 fSendUnacknowledged.Number()); 2557 kprintf(" next: %" B_PRIu32 "\n", fSendNext.Number()); 2558 kprintf(" max: %" B_PRIu32 "\n", fSendMax.Number()); 2559 kprintf(" urgent offset: %" B_PRIu32 "\n", fSendUrgentOffset.Number()); 2560 kprintf(" window: %" B_PRIu32 "\n", fSendWindow); 2561 kprintf(" max window: %" B_PRIu32 "\n", fSendMaxWindow); 2562 kprintf(" max segment size: %" B_PRIu32 "\n", fSendMaxSegmentSize); 2563 kprintf(" queue: %" B_PRIuSIZE " / %" B_PRIuSIZE "\n", fSendQueue.Used(), 2564 fSendQueue.Size()); 2565 #if DEBUG_TCP_BUFFER_QUEUE 2566 fSendQueue.Dump(); 2567 #endif 2568 kprintf(" last acknowledge sent: %" B_PRIu32 "\n", 2569 fLastAcknowledgeSent.Number()); 2570 kprintf(" initial sequence: %" B_PRIu32 "\n", 2571 fInitialSendSequence.Number()); 2572 kprintf(" receive\n"); 2573 kprintf(" window shift: %" B_PRIu8 "\n", fReceiveWindowShift); 2574 kprintf(" next: %" B_PRIu32 "\n", fReceiveNext.Number()); 2575 kprintf(" max advertised: %" B_PRIu32 "\n", 2576 fReceiveMaxAdvertised.Number()); 2577 kprintf(" window: %" B_PRIu32 "\n", fReceiveWindow); 2578 kprintf(" max segment size: %" B_PRIu32 "\n", fReceiveMaxSegmentSize); 2579 kprintf(" queue: %" B_PRIuSIZE " / %" B_PRIuSIZE "\n", 2580 fReceiveQueue.Available(), fReceiveQueue.Size()); 2581 #if DEBUG_TCP_BUFFER_QUEUE 2582 fReceiveQueue.Dump(); 2583 #endif 2584 kprintf(" initial sequence: %" B_PRIu32 "\n", 2585 fInitialReceiveSequence.Number()); 2586 kprintf(" duplicate acknowledge count: %" B_PRIu32 "\n", 2587 fDuplicateAcknowledgeCount); 2588 kprintf(" smoothed round trip time: %" B_PRId32 " (deviation %" B_PRId32 ")\n", 2589 fSmoothedRoundTripTime, fRoundTripVariation); 2590 kprintf(" retransmit timeout: %" B_PRId64 "\n", fRetransmitTimeout); 2591 kprintf(" congestion window: %" B_PRIu32 "\n", fCongestionWindow); 2592 kprintf(" slow start threshold: %" B_PRIu32 "\n", fSlowStartThreshold); 2593 } 2594 2595